[PATCH] D93031: Enable fexec-charset option

Abhina Sree via Phabricator via cfe-commits Fri, 09 Apr 2021 05:39:53 -0700

abhina.sreeskantharajan updated this revision to Diff 336416.
abhina.sreeskantharajan added a comment.


Rebase + fix CharLiteralParser endian issue by saving the char to a char 
variable first and then creating a StringRef


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D93031/new/

https://reviews.llvm.org/D93031

Files:
  clang/include/clang/Basic/LangOptions.h
  clang/include/clang/Basic/TokenKinds.h
  clang/include/clang/Driver/Options.td
  clang/include/clang/Lex/LiteralConverter.h
  clang/include/clang/Lex/LiteralSupport.h
  clang/include/clang/Lex/Preprocessor.h
  clang/lib/Driver/ToolChains/Clang.cpp
  clang/lib/Frontend/CompilerInstance.cpp
  clang/lib/Lex/CMakeLists.txt
  clang/lib/Lex/LiteralConverter.cpp
  clang/lib/Lex/LiteralSupport.cpp
  clang/test/CodeGen/systemz-charset.c
  clang/test/CodeGen/systemz-charset.cpp
  clang/test/Driver/cl-options.c
  clang/test/Driver/clang_f_opts.c
  llvm/cmake/config-ix.cmake
  llvm/include/llvm/ADT/Triple.h
  llvm/include/llvm/Config/config.h.cmake
  llvm/include/llvm/Support/CharSet.h
  llvm/lib/Support/CMakeLists.txt
  llvm/lib/Support/CharSet.cpp
  llvm/lib/Support/Triple.cpp
  llvm/unittests/Support/CMakeLists.txt
  llvm/unittests/Support/CharSetTest.cpp

Index: llvm/unittests/Support/CharSetTest.cpp
===================================================================
--- /dev/null
+++ llvm/unittests/Support/CharSetTest.cpp
@@ -0,0 +1,191 @@
+//===- unittests/Support/CharSetTest.cpp - Charset conversion tests -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/CharSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "gtest/gtest.h"
+using namespace llvm;
+
+namespace {
+
+// String "Hello World!\n"
+static const char HelloA[] =
+    "\x48\x65\x6C\x6C\x6F\x20\x57\x6F\x72\x6C\x64\x21\x0a";
+static const char HelloE[] =
+    "\xC8\x85\x93\x93\x96\x40\xE6\x96\x99\x93\x84\x5A\x15";
+
+// String "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+static const char ABCStrA[] =
+    "\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4A\x4B\x4C\x4D\x4E\x4F\x50\x51\x52"
+    "\x53\x54\x55\x56\x57\x58\x59\x5A\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6A"
+    "\x6B\x6C\x6D\x6E\x6F\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7A";
+static const char ABCStrE[] =
+    "\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9"
+    "\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\x81\x82\x83\x84\x85\x86\x87\x88\x89\x91"
+    "\x92\x93\x94\x95\x96\x97\x98\x99\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9";
+
+// String "¡¢£AÄÅÆEÈÉÊaàáâãäeèéêë"
+static const char AccentUTF[] =
+    "\xc2\xa1\xc2\xa2\xc2\xa3\x41\xc3\x84\xc3\x85\xc3\x86\x45\xc3\x88\xc3\x89"
+    "\xc3\x8a\x61\xc3\xa0\xc3\xa1\xc3\xa2\xc3\xa3\xc3\xa4\x65\xc3\xa8\xc3\xa9"
+    "\xc3\xaa\xc3\xab";
+static const char AccentE[] = "\xaa\x4a\xb1\xc1\x63\x67\x9e\xc5\x74\x71\x72"
+                              "\x81\x44\x45\x42\x46\x43\x85\x54\x51\x52\x53";
+
+TEST(CharSet, FromASCII) {
+  // Hello string.
+  StringRef Src(HelloA);
+  SmallString<64> Dst;
+
+  CharSetConverter Conv = CharSetConverter::create(
+      CharSetConverter::CS_LATIN1, CharSetConverter::CS_IBM1047);
+  std::error_code EC = Conv.convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(HelloE, static_cast<std::string>(Dst).c_str());
+
+  // ABC string.
+  Src = ABCStrA;
+  Dst.clear();
+  EC = Conv.convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(ABCStrE, static_cast<std::string>(Dst).c_str());
+}
+
+TEST(CharSet, ToASCII) {
+  // Hello string.
+  StringRef Src(HelloE);
+  SmallString<64> Dst;
+
+  CharSetConverter Conv = CharSetConverter::create(CharSetConverter::CS_IBM1047,
+                                                   CharSetConverter::CS_LATIN1);
+  std::error_code EC = Conv.convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(HelloA, static_cast<std::string>(Dst).c_str());
+
+  // ABC string.
+  Src = ABCStrE;
+  Dst.clear();
+  EC = Conv.convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(ABCStrA, static_cast<std::string>(Dst).c_str());
+}
+
+TEST(CharSet, FromUTF8) {
+  // Hello string.
+  StringRef Src(HelloA);
+  SmallString<64> Dst;
+
+  CharSetConverter Conv = CharSetConverter::create(
+      CharSetConverter::CS_UTF8, CharSetConverter::CS_IBM1047);
+  std::error_code EC = Conv.convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(HelloE, static_cast<std::string>(Dst).c_str());
+
+  // ABC string.
+  Src = ABCStrA;
+  Dst.clear();
+  EC = Conv.convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(ABCStrE, static_cast<std::string>(Dst).c_str());
+
+  // Accent string.
+  Src = AccentUTF;
+  Dst.clear();
+  EC = Conv.convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(AccentE, static_cast<std::string>(Dst).c_str());
+}
+
+TEST(CharSet, ToUTF8) {
+  // Hello string.
+  StringRef Src(HelloE);
+  SmallString<64> Dst;
+
+  CharSetConverter Conv = CharSetConverter::create(CharSetConverter::CS_IBM1047,
+                                                   CharSetConverter::CS_UTF8);
+  std::error_code EC = Conv.convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(HelloA, static_cast<std::string>(Dst).c_str());
+
+  // ABC string.
+  Src = ABCStrE;
+  Dst.clear();
+  EC = Conv.convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(ABCStrA, static_cast<std::string>(Dst).c_str());
+
+  // Accent string.
+  Src = AccentE;
+  Dst.clear();
+  EC = Conv.convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(AccentUTF, static_cast<std::string>(Dst).c_str());
+}
+
+TEST(CharSet, Identity) {
+  // Hello string.
+  StringRef Src(HelloA);
+  SmallString<64> Dst;
+
+  CharSetConverter Conv = CharSetConverter::create(CharSetConverter::CS_LATIN1,
+                                                   CharSetConverter::CS_LATIN1);
+  std::error_code EC = Conv.convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(HelloA, static_cast<std::string>(Dst).c_str());
+
+  // ABC string.
+  Src = ABCStrA;
+  Dst.clear();
+  EC = Conv.convert(Src, Dst);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(ABCStrA, static_cast<std::string>(Dst).c_str());
+}
+
+TEST(CharSet, RoundTrip) {
+  ErrorOr<CharSetConverter> ConvToUTF16 =
+      CharSetConverter::create("IBM-1047", "UTF-16");
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvToUTF16) {
+    ASSERT_EQ(ConvToUTF16.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+  ErrorOr<CharSetConverter> ConvToUTF32 =
+      CharSetConverter::create("UTF-16", "UTF-32");
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvToUTF32) {
+    ASSERT_EQ(ConvToUTF32.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+  ErrorOr<CharSetConverter> ConvToEBCDIC =
+      CharSetConverter::create("UTF-32", "IBM-1047");
+  // Stop test if conversion is not supported (no underlying iconv support).
+  if (!ConvToEBCDIC) {
+    ASSERT_EQ(ConvToEBCDIC.getError(),
+              std::make_error_code(std::errc::invalid_argument));
+    return;
+  }
+
+  // Setup source string.
+  char SrcStr[256];
+  for (size_t I = 0; I < 256; ++I)
+    SrcStr[I] = (I + 1) % 256;
+
+  SmallString<99> Dst1Str, Dst2Str, Dst3Str;
+
+  std::error_code EC = ConvToUTF16->convert(StringRef(SrcStr), Dst1Str);
+  EXPECT_TRUE(!EC);
+  EC = ConvToUTF32->convert(Dst1Str, Dst2Str);
+  EXPECT_TRUE(!EC);
+  EC = ConvToEBCDIC->convert(Dst2Str, Dst3Str);
+  EXPECT_TRUE(!EC);
+  EXPECT_STREQ(SrcStr, static_cast<std::string>(Dst3Str).c_str());
+}
+
+} // namespace
Index: llvm/unittests/Support/CMakeLists.txt
===================================================================
--- llvm/unittests/Support/CMakeLists.txt
+++ llvm/unittests/Support/CMakeLists.txt
@@ -14,6 +14,7 @@
   BlockFrequencyTest.cpp
   BranchProbabilityTest.cpp
   CachePruningTest.cpp
+  CharSetTest.cpp
   CrashRecoveryTest.cpp
   Casting.cpp
   CheckedArithmeticTest.cpp
Index: llvm/lib/Support/Triple.cpp
===================================================================
--- llvm/lib/Support/Triple.cpp
+++ llvm/lib/Support/Triple.cpp
@@ -1046,6 +1046,13 @@
   return Tmp.split('-').second;                      // Strip second component
 }
 
+// Reports the name of the system charset for this triple: "IBM-1047" when the
+// OS component is z/OS, "UTF-8" for every other OS.
+StringRef Triple::getSystemCharset() const {
+  const bool TargetsZOS = getOS() == llvm::Triple::ZOS;
+  return TargetsZOS ? "IBM-1047" : "UTF-8";
+}
+
 static unsigned EatNumber(StringRef &Str) {
   assert(!Str.empty() && isDigit(Str[0]) && "Not a number");
   unsigned Result = 0;
Index: llvm/lib/Support/CharSet.cpp
===================================================================
--- /dev/null
+++ llvm/lib/Support/CharSet.cpp
@@ -0,0 +1,203 @@
+//===-- CharSet.cpp - Utility class to convert between char sets --*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides utility classes to convert between different character
+/// set encoding.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/CharSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include <algorithm>
+#include <system_error>
+
+#ifdef HAVE_ICONV
+#include <iconv.h>
+#endif
+
+using namespace llvm;
+
+namespace {
+
+// Translates a charset name into the matching CharSetNames enumerator. Only
+// the exact spellings compared against below are recognized; every other name
+// yields None.
+Optional<CharSetConverter::CharSetNames> getKnownCharSet(StringRef CSName) {
+  if (CSName == "UTF-8")
+    return CharSetConverter::CS_UTF8;
+  if (CSName == "ISO8859-1")
+    return CharSetConverter::CS_LATIN1;
+  if (CSName == "IBM-1047")
+    return CharSetConverter::CS_IBM1047;
+  return None;
+}
+
+// Character conversion between Enhanced ASCII and EBCDIC (IBM-1047).
+const unsigned char ISO88591ToIBM1047[256] = {
+    0x00, 0x01, 0x02, 0x03, 0x37, 0x2d, 0x2e, 0x2f, 0x16, 0x05, 0x15, 0x0b,
+    0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x3c, 0x3d, 0x32, 0x26,
+    0x18, 0x19, 0x3f, 0x27, 0x1c, 0x1d, 0x1e, 0x1f, 0x40, 0x5a, 0x7f, 0x7b,
+    0x5b, 0x6c, 0x50, 0x7d, 0x4d, 0x5d, 0x5c, 0x4e, 0x6b, 0x60, 0x4b, 0x61,
+    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0x7a, 0x5e,
+    0x4c, 0x7e, 0x6e, 0x6f, 0x7c, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+    0xc8, 0xc9, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xe2,
+    0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xad, 0xe0, 0xbd, 0x5f, 0x6d,
+    0x79, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x91, 0x92,
+    0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6,
+    0xa7, 0xa8, 0xa9, 0xc0, 0x4f, 0xd0, 0xa1, 0x07, 0x20, 0x21, 0x22, 0x23,
+    0x24, 0x25, 0x06, 0x17, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x09, 0x0a, 0x1b,
+    0x30, 0x31, 0x1a, 0x33, 0x34, 0x35, 0x36, 0x08, 0x38, 0x39, 0x3a, 0x3b,
+    0x04, 0x14, 0x3e, 0xff, 0x41, 0xaa, 0x4a, 0xb1, 0x9f, 0xb2, 0x6a, 0xb5,
+    0xbb, 0xb4, 0x9a, 0x8a, 0xb0, 0xca, 0xaf, 0xbc, 0x90, 0x8f, 0xea, 0xfa,
+    0xbe, 0xa0, 0xb6, 0xb3, 0x9d, 0xda, 0x9b, 0x8b, 0xb7, 0xb8, 0xb9, 0xab,
+    0x64, 0x65, 0x62, 0x66, 0x63, 0x67, 0x9e, 0x68, 0x74, 0x71, 0x72, 0x73,
+    0x78, 0x75, 0x76, 0x77, 0xac, 0x69, 0xed, 0xee, 0xeb, 0xef, 0xec, 0xbf,
+    0x80, 0xfd, 0xfe, 0xfb, 0xfc, 0xba, 0xae, 0x59, 0x44, 0x45, 0x42, 0x46,
+    0x43, 0x47, 0x9c, 0x48, 0x54, 0x51, 0x52, 0x53, 0x58, 0x55, 0x56, 0x57,
+    0x8c, 0x49, 0xcd, 0xce, 0xcb, 0xcf, 0xcc, 0xe1, 0x70, 0xdd, 0xde, 0xdb,
+    0xdc, 0x8d, 0x8e, 0xdf};
+
+const unsigned char IBM1047ToISO88591[256] = {
+    0x00, 0x01, 0x02, 0x03, 0x9c, 0x09, 0x86, 0x7f, 0x97, 0x8d, 0x8e, 0x0b,
+    0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x9d, 0x0a, 0x08, 0x87,
+    0x18, 0x19, 0x92, 0x8f, 0x1c, 0x1d, 0x1e, 0x1f, 0x80, 0x81, 0x82, 0x83,
+    0x84, 0x85, 0x17, 0x1b, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x05, 0x06, 0x07,
+    0x90, 0x91, 0x16, 0x93, 0x94, 0x95, 0x96, 0x04, 0x98, 0x99, 0x9a, 0x9b,
+    0x14, 0x15, 0x9e, 0x1a, 0x20, 0xa0, 0xe2, 0xe4, 0xe0, 0xe1, 0xe3, 0xe5,
+    0xe7, 0xf1, 0xa2, 0x2e, 0x3c, 0x28, 0x2b, 0x7c, 0x26, 0xe9, 0xea, 0xeb,
+    0xe8, 0xed, 0xee, 0xef, 0xec, 0xdf, 0x21, 0x24, 0x2a, 0x29, 0x3b, 0x5e,
+    0x2d, 0x2f, 0xc2, 0xc4, 0xc0, 0xc1, 0xc3, 0xc5, 0xc7, 0xd1, 0xa6, 0x2c,
+    0x25, 0x5f, 0x3e, 0x3f, 0xf8, 0xc9, 0xca, 0xcb, 0xc8, 0xcd, 0xce, 0xcf,
+    0xcc, 0x60, 0x3a, 0x23, 0x40, 0x27, 0x3d, 0x22, 0xd8, 0x61, 0x62, 0x63,
+    0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xab, 0xbb, 0xf0, 0xfd, 0xfe, 0xb1,
+    0xb0, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0xaa, 0xba,
+    0xe6, 0xb8, 0xc6, 0xa4, 0xb5, 0x7e, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
+    0x79, 0x7a, 0xa1, 0xbf, 0xd0, 0x5b, 0xde, 0xae, 0xac, 0xa3, 0xa5, 0xb7,
+    0xa9, 0xa7, 0xb6, 0xbc, 0xbd, 0xbe, 0xdd, 0xa8, 0xaf, 0x5d, 0xb4, 0xd7,
+    0x7b, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0xad, 0xf4,
+    0xf6, 0xf2, 0xf3, 0xf5, 0x7d, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50,
+    0x51, 0x52, 0xb9, 0xfb, 0xfc, 0xf9, 0xfa, 0xff, 0x5c, 0xf7, 0x53, 0x54,
+    0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0xb2, 0xd4, 0xd6, 0xd2, 0xd3, 0xd5,
+    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0xb3, 0xdb,
+    0xdc, 0xd9, 0xda, 0x9f};
+
+enum { NoUTF = 0x0, SrcIsUTF = 0x1, DstIsUTF = 0x2 };
+
+/// Converts \p Source one character at a time and appends the converted bytes
+/// to \p Result.
+///
+/// \param Table 256-entry translation table applied to every character, or
+///        nullptr to leave the character unchanged
+/// \param Flags combination of SrcIsUTF and DstIsUTF selecting whether the
+///        input is decoded from, and the output encoded to, UTF-8
+/// \return illegal_byte_sequence if a byte >= 0x80 is not part of a
+///         well-formed two-byte UTF-8 sequence starting with 0xc2 or 0xc3,
+///         invalid_argument if the input ends right after such a lead byte,
+///         and a default-constructed error code on success. Bytes appended
+///         before an error stay in \p Result.
+std::error_code convertWithTable(const unsigned char *Table, unsigned Flags,
+                                 StringRef Source,
+                                 SmallVectorImpl<char> &Result) {
+  const bool DecodeUTF8 = (Flags & SrcIsUTF) != 0;
+  const bool EncodeUTF8 = (Flags & DstIsUTF) != 0;
+  const unsigned char *Cur =
+      reinterpret_cast<const unsigned char *>(Source.data());
+  const unsigned char *const End = Cur + Source.size();
+
+  while (Cur != End) {
+    unsigned char Byte = *Cur++;
+
+    // Collapse a two-byte UTF-8 sequence from the input into a single byte.
+    if (DecodeUTF8 && Byte >= 0x80) {
+      // Lead bytes other than 0xc2 and 0xc3 cannot be decoded.
+      if (Byte != 0xc2 && Byte != 0xc3)
+        return std::make_error_code(std::errc::illegal_byte_sequence);
+      // The input must not end in the middle of a sequence.
+      if (Cur == End)
+        return std::make_error_code(std::errc::invalid_argument);
+      const unsigned char Trail = *Cur++;
+      // A trail byte has the bit pattern 10xxxxxx.
+      if ((Trail & 0xc0) != 0x80)
+        return std::make_error_code(std::errc::illegal_byte_sequence);
+      // Only the low eight bits survive the assignment, which maps
+      // 0xc2 0x80..0xbf to 0x80..0xbf and 0xc3 0x80..0xbf to 0xc0..0xff.
+      Byte = Trail | (Byte << 6);
+    }
+
+    // Translate the character.
+    if (Table)
+      Byte = Table[Byte];
+
+    // Expand bytes outside of ASCII into a two-byte UTF-8 sequence.
+    if (EncodeUTF8 && Byte >= 0x80) {
+      // The lead byte is 0xc2 or 0xc3, depending on bit 6 of the byte.
+      Result.push_back(static_cast<char>(0xc0 | (Byte >> 6)));
+      // The trail byte is the same byte with bit 6 cleared.
+      Byte &= 0xbf;
+    }
+    Result.push_back(static_cast<char>(Byte));
+  }
+  return std::error_code();
+}
+
+#ifdef HAVE_ICONV
+/// Converts \p Source with the iconv descriptor \p ConvDesc and stores the
+/// converted bytes in \p Result.
+///
+/// Any previous content of \p Result is overwritten; its existing capacity is
+/// only used as the initial output buffer.
+/// NOTE(review): convertWithTable() appends to \p Result instead - confirm
+/// which of the two behaviors callers should be able to rely on.
+///
+/// \return a default-constructed error code on success, otherwise the errno
+///         value reported by iconv() in the generic category. On error
+///         \p Result holds the bytes converted up to that point.
+std::error_code convertWithIconv(iconv_t ConvDesc, StringRef Source,
+                                 SmallVectorImpl<char> &Result) {
+  // Setup the input. iconv() wants a pointer to non-const char.
+  size_t InputLength = Source.size();
+  char *Input = const_cast<char *>(Source.data());
+  // Setup the output. We directly write into the SmallVector. Start with at
+  // least one byte: a capacity of zero stays zero when doubled, so the loop
+  // below would never make progress.
+  size_t Capacity = std::max<size_t>(Result.capacity(), 1);
+  Result.resize(Capacity);
+  char *Output = Result.data();
+  size_t OutputLength = Capacity;
+
+  while (iconv(ConvDesc, &Input, &InputLength, &Output, &OutputLength) ==
+         static_cast<size_t>(-1)) {
+    // Save errno before any other call gets a chance to overwrite it.
+    const int Err = errno;
+    const size_t Used = Capacity - OutputLength;
+    if (Err != E2BIG) {
+      // Some other error occurred. Keep only the bytes written so far instead
+      // of the whole, partly unwritten, output buffer.
+      Result.resize(Used);
+      return std::error_code(Err, std::generic_category());
+    }
+    // No space left in output buffer. Double the size of the underlying
+    // memory in the SmallVectorImpl, adjust pointer and length and continue
+    // the conversion.
+    Capacity *= 2;
+    Result.resize(Capacity);
+    Output = Result.data() + Used;
+    OutputLength = Capacity - Used;
+  }
+
+  // Re-adjust size to actual size.
+  Result.resize(Capacity - OutputLength);
+  return std::error_code();
+}
+#endif
+} // namespace
+
+/// Creates a table-driven converter between two of the built-in charsets.
+/// Requesting a conversion from a charset to itself yields a converter that
+/// copies the input bytes unchanged.
+CharSetConverter CharSetConverter::create(CharSetNames CSFrom,
+                                          CharSetNames CSTo) {
+  unsigned Flags = NoUTF;
+  const unsigned char *Table = nullptr;
+  // Converting a charset to itself must be a plain byte copy. Without this
+  // check IBM-1047 -> IBM-1047 would end up with the ISO 8859-1 -> IBM-1047
+  // table, and UTF-8 -> UTF-8 would reject every sequence that
+  // convertWithTable() cannot decode into a single byte.
+  if (CSFrom != CSTo) {
+    if (CSFrom == CS_UTF8)
+      Flags |= SrcIsUTF;
+    if (CSTo == CS_UTF8)
+      Flags |= DstIsUTF;
+    // At most one side is IBM-1047 here, so a single table is sufficient.
+    if (CSFrom == CS_IBM1047)
+      Table = IBM1047ToISO88591;
+    else if (CSTo == CS_IBM1047)
+      Table = ISO88591ToIBM1047;
+  }
+  // The table converter owns no resources, hence no cleanup function.
+  return CharSetConverter{
+      [Table, Flags](StringRef Source, SmallVectorImpl<char> &Result) {
+        return convertWithTable(Table, Flags, Source, Result);
+      },
+      nullptr};
+}
+
+/// Creates a converter for a pair of charset names. The built-in tables are
+/// used when both names are known; otherwise iconv is tried if it is
+/// available. Without iconv, any other pair is rejected with invalid_argument.
+ErrorOr<CharSetConverter> CharSetConverter::create(StringRef CSFrom,
+                                                   StringRef CSTo) {
+  const Optional<CharSetConverter::CharSetNames> KnownFrom =
+      getKnownCharSet(CSFrom);
+  const Optional<CharSetConverter::CharSetNames> KnownTo =
+      getKnownCharSet(CSTo);
+  if (KnownFrom && KnownTo)
+    return create(*KnownFrom, *KnownTo);
+#ifdef HAVE_ICONV
+  // Note the argument order: iconv_open() takes the target charset first.
+  const std::string ToName = CSTo.str();
+  const std::string FromName = CSFrom.str();
+  iconv_t Descriptor = iconv_open(ToName.c_str(), FromName.c_str());
+  if (Descriptor == reinterpret_cast<iconv_t>(-1))
+    return std::error_code(errno, std::generic_category());
+  // The descriptor is closed by the cleanup function of the converter.
+  return CharSetConverter{
+      [Descriptor](StringRef Source, SmallVectorImpl<char> &Result) {
+        return convertWithIconv(Descriptor, Source, Result);
+      },
+      [Descriptor]() { iconv_close(Descriptor); }};
+#else
+  return std::make_error_code(std::errc::invalid_argument);
+#endif
+}
\ No newline at end of file
Index: llvm/lib/Support/CMakeLists.txt
===================================================================
--- llvm/lib/Support/CMakeLists.txt
+++ llvm/lib/Support/CMakeLists.txt
@@ -53,6 +53,11 @@
   set(system_libs ${system_libs} ${Z3_LIBRARIES})
 endif()
 
+# Link iconv library if it is an external library.
+if(Iconv_FOUND AND NOT Iconv_IS_BUILT_IN)
+  set(system_libs ${system_libs} ${Iconv_LIBRARIES})
+endif()
+
 # Override the C runtime allocator on Windows and embed it into LLVM tools & libraries
 if(LLVM_INTEGRATED_CRT_ALLOC)
   if (CMAKE_BUILD_TYPE AND NOT ${LLVM_USE_CRT_${uppercase_CMAKE_BUILD_TYPE}} MATCHES "^(MT|MTd)$")
@@ -102,6 +107,7 @@
   BuryPointer.cpp
   CachePruning.cpp
   circular_raw_ostream.cpp
+  CharSet.cpp
   Chrono.cpp
   COM.cpp
   CodeGenCoverage.cpp
Index: llvm/include/llvm/Support/CharSet.h
===================================================================
--- /dev/null
+++ llvm/include/llvm/Support/CharSet.h
@@ -0,0 +1,117 @@
+//===-- CharSet.h - Utility class to convert between char sets ----*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides a utility class to convert between different character
+/// set encodings.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_CHARSET_H
+#define LLVM_SUPPORT_CHARSET_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Config/config.h"
+#include "llvm/Support/ErrorOr.h"
+
+#include <functional>
+#include <string>
+#include <system_error>
+#include <utility>
+
+namespace llvm {
+
+template <typename T> class SmallVectorImpl;
+
+/// Utility class to convert between different character set encodings.
+/// The class always supports converting between EBCDIC 1047 and Latin-1/UTF-8.
+/// If the iconv library is available, then arbitrary conversions are supported.
+/// An instance may own a resource that is released by its cleanup function,
+/// which is why the class can be moved but not copied.
+/// TODO Add Windows support.
+class CharSetConverter {
+public:
+  using ConverterFunc =
+      std::function<std::error_code(StringRef, SmallVectorImpl<char> &)>;
+  using CleanupFunc = std::function<void(void)>;
+
+private:
+  /// Performs the actual conversion.
+  ConverterFunc Convert;
+  /// Called once when the converter is destroyed or overwritten. Empty if
+  /// there is nothing to release or if ownership was moved elsewhere.
+  CleanupFunc Cleanup;
+
+public:
+  enum CharSetNames {
+    /// UTF-8 character set encoding.
+    CS_UTF8,
+
+    /// ISO 8859-1 (Latin-1) character set encoding.
+    CS_LATIN1,
+
+    /// IBM EBCDIC 1047 character set encoding.
+    CS_IBM1047
+  };
+
+private:
+  CharSetConverter(ConverterFunc Convert, CleanupFunc Cleanup)
+      : Convert(std::move(Convert)), Cleanup(std::move(Cleanup)) {}
+
+public:
+  /// Creates a CharSetConverter instance.
+  /// \param[in] CSFrom name of the source character encoding
+  /// \param[in] CSTo name of the target character encoding
+  /// \return a CharSetConverter instance
+  static CharSetConverter create(CharSetNames CSFrom, CharSetNames CSTo);
+
+  /// Creates a CharSetConverter instance.
+  /// Returns std::errc::invalid_argument in case the requested conversion is
+  /// not supported.
+  /// \param[in] CSFrom name of the source character encoding
+  /// \param[in] CSTo name of the target character encoding
+  /// \return a CharSetConverter instance or an error code
+  static ErrorOr<CharSetConverter> create(StringRef CSFrom, StringRef CSTo);
+
+  CharSetConverter(const CharSetConverter &) = delete;
+  CharSetConverter &operator=(const CharSetConverter &) = delete;
+
+  /// Takes over the conversion and cleanup functions of \p Other. Afterwards
+  /// \p Other no longer runs the cleanup function when it is destroyed.
+  CharSetConverter(CharSetConverter &&Other)
+      : Convert(std::move(Other.Convert)), Cleanup(std::move(Other.Cleanup)) {
+    // A moved-from std::function is in an unspecified state, so clear it
+    // explicitly to make sure the cleanup runs exactly once.
+    Other.Cleanup = nullptr;
+  }
+
+  /// Releases what this instance owns, then takes over the conversion and
+  /// cleanup functions of \p Other. Assigning an instance to itself is a
+  /// no-op.
+  CharSetConverter &operator=(CharSetConverter &&Other) {
+    if (this == &Other)
+      return *this;
+    // Run the current cleanup before it is overwritten; otherwise the
+    // resource it guards would never be released.
+    if (Cleanup)
+      Cleanup();
+    Convert = std::move(Other.Convert);
+    Cleanup = std::move(Other.Cleanup);
+    // See the move constructor: make sure the cleanup runs exactly once.
+    Other.Cleanup = nullptr;
+    return *this;
+  }
+
+  ~CharSetConverter() {
+    if (Cleanup)
+      Cleanup();
+  }
+
+  /// Converts a string.
+  /// \param[in] Source source string
+  /// \param[in,out] Result container for converted string
+  /// \return error code in case something went wrong
+  std::error_code convert(StringRef Source,
+                          SmallVectorImpl<char> &Result) const {
+    return Convert(Source, Result);
+  }
+
+  /// Converts a string.
+  /// \param[in] Source source string
+  /// \param[in,out] Result container for converted string
+  /// \return error code in case something went wrong
+  std::error_code convert(const std::string &Source,
+                          SmallVectorImpl<char> &Result) const {
+    return convert(StringRef(Source), Result);
+  }
+};
+
+} // namespace llvm
+
+#endif
Index: llvm/include/llvm/Config/config.h.cmake
===================================================================
--- llvm/include/llvm/Config/config.h.cmake
+++ llvm/include/llvm/Config/config.h.cmake
@@ -97,6 +97,9 @@
 /* Define to 1 if you have the `getrusage' function. */
 #cmakedefine HAVE_GETRUSAGE ${HAVE_GETRUSAGE}
 
+/* Define to 1 if you have the iconv library functions. */
+#cmakedefine HAVE_ICONV ${HAVE_ICONV}
+
 /* Define to 1 if you have the `isatty' function. */
 #cmakedefine HAVE_ISATTY 1
 
Index: llvm/include/llvm/ADT/Triple.h
===================================================================
--- llvm/include/llvm/ADT/Triple.h
+++ llvm/include/llvm/ADT/Triple.h
@@ -397,6 +397,9 @@
   /// if the environment component is present).
   StringRef getOSAndEnvironmentName() const;
 
+  /// getSystemCharset - Get the system charset of the triple.
+  StringRef getSystemCharset() const;
+
   /// @}
   /// @name Convenience Predicates
   /// @{
Index: llvm/cmake/config-ix.cmake
===================================================================
--- llvm/cmake/config-ix.cmake
+++ llvm/cmake/config-ix.cmake
@@ -194,6 +194,14 @@
   set(XAR_LIB xar)
 endif()
 
+# Check for iconv.
+find_package(Iconv)
+if(Iconv_FOUND)
+  set(HAVE_ICONV 1)
+else()
+  set(HAVE_ICONV 0)
+endif()
+
 # function checks
 check_symbol_exists(arc4random "stdlib.h" HAVE_DECL_ARC4RANDOM)
 find_package(Backtrace)
Index: clang/test/Driver/clang_f_opts.c
===================================================================
--- clang/test/Driver/clang_f_opts.c
+++ clang/test/Driver/clang_f_opts.c
@@ -209,8 +209,14 @@
 // RUN: %clang -### -S -finput-charset=iso-8859-1 -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-INVALID-CHARSET %s
 // CHECK-INVALID-CHARSET: error: invalid value 'iso-8859-1' in '-finput-charset=iso-8859-1'
 
-// RUN: %clang -### -S -fexec-charset=iso-8859-1 -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-INVALID-INPUT-CHARSET %s
-// CHECK-INVALID-INPUT-CHARSET: error: invalid value 'iso-8859-1' in '-fexec-charset=iso-8859-1'
+// RUN: %clang -### -S -fexec-charset=invalid-charset -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-INVALID-INPUT-CHARSET %s
+// CHECK-INVALID-INPUT-CHARSET: error: invalid value 'invalid-charset' in '-fexec-charset=invalid-charset'
+
+// Test that we support the following exec charsets.
+// RUN: %clang -### -S -fexec-charset=UTF-8 -o /dev/null %s 2>&1 | FileCheck --check-prefix=INVALID %s
+// RUN: %clang -### -S -fexec-charset=ISO8859-1 -o /dev/null %s 2>&1 | FileCheck --check-prefix=INVALID %s
+// RUN: %clang -### -S -fexec-charset=IBM-1047 -o /dev/null %s 2>&1 | FileCheck --check-prefix=INVALID %s
+// INVALID-NOT: error: invalid value
 
 // Test that we don't error on these.
 // RUN: %clang -### -S -Werror                                                \
@@ -224,7 +230,7 @@
 // RUN:     -fident -fno-ident                                                \
 // RUN:     -fimplicit-templates -fno-implicit-templates                      \
 // RUN:     -finput-charset=UTF-8                                             \
-// RUN:     -fexec-charset=UTF-8                                             \
+// RUN:     -fexec-charset=UTF-8                                              \
 // RUN:     -fivopts -fno-ivopts                                              \
 // RUN:     -fnon-call-exceptions -fno-non-call-exceptions                    \
 // RUN:     -fpermissive -fno-permissive                                      \
Index: clang/test/Driver/cl-options.c
===================================================================
--- clang/test/Driver/cl-options.c
+++ clang/test/Driver/cl-options.c
@@ -210,10 +210,11 @@
 // RUN: %clang_cl /source-charset:utf-16 -### -- %s 2>&1 | FileCheck -check-prefix=source-charset-utf-16 %s
 // source-charset-utf-16: invalid value 'utf-16' in '/source-charset:utf-16'
 
-// /execution-charset: should warn on everything except UTF-8.
-// RUN: %clang_cl /execution-charset:utf-16 -### -- %s 2>&1 | FileCheck -check-prefix=execution-charset-utf-16 %s
-// execution-charset-utf-16: invalid value 'utf-16' in '/execution-charset:utf-16'
+// /execution-charset: should warn on invalid charsets.
+// RUN: %clang_cl /execution-charset:invalid-charset -### -- %s 2>&1 | FileCheck -check-prefix=execution-charset-invalid %s
+// execution-charset-invalid: invalid value 'invalid-charset' in '/execution-charset:invalid-charset'
 //
+
 // RUN: %clang_cl /Umymacro -### -- %s 2>&1 | FileCheck -check-prefix=U %s
 // RUN: %clang_cl /U mymacro -### -- %s 2>&1 | FileCheck -check-prefix=U %s
 // U: "-U" "mymacro"
Index: clang/test/CodeGen/systemz-charset.cpp
===================================================================
--- /dev/null
+++ clang/test/CodeGen/systemz-charset.cpp
@@ -0,0 +1,46 @@
+// RUN: %clang %s -std=c++17 -emit-llvm -S -target s390x-ibm-zos -o - | FileCheck %s
+
+const char *RawString = R"(Hello\n)";
+//CHECK: c"\C8\85\93\93\96\E0\95\00"
+
+const char *MultiLineRawString = R"(
+Hello
+There)";
+//CHECK: c"\15\C8\85\93\93\96\15\E3\88\85\99\85\00"
+
+char UnicodeChar8 = u8'1';
+//CHECK: i8 49
+char16_t UnicodeChar16 = u'1';
+//CHECK: i16 49
+char32_t UnicodeChar32 = U'1';
+//CHECK: i32 49
+
+const char *EscapeCharacters8 = u8"\a\b\f\n\r\t\v\\\'\"\?";
+//CHECK: c"\07\08\0C\0A\0D\09\0B\\'\22?\00"
+
+const char16_t *EscapeCharacters16 = u"\a\b\f\n\r\t\v\\\'\"\?";
+//CHECK: [12 x i16] [i16 7, i16 8, i16 12, i16 10, i16 13, i16 9, i16 11, i16 92, i16 39, i16 34, i16 63, i16 0]
+
+const char32_t *EscapeCharacters32 = U"\a\b\f\n\r\t\v\\\'\"\?";
+//CHECK: [12 x i32] [i32 7, i32 8, i32 12, i32 10, i32 13, i32 9, i32 11, i32 92, i32 39, i32 34, i32 63, i32 0]
+
+const char *UnicodeString8 = u8"Hello";
+//CHECK: c"Hello\00"
+const char16_t *UnicodeString16 = u"Hello";
+//CHECK: [6 x i16] [i16 72, i16 101, i16 108, i16 108, i16 111, i16 0]
+const char32_t *UnicodeString32 = U"Hello";
+//CHECK: [6 x i32] [i32 72, i32 101, i32 108, i32 108, i32 111, i32 0]
+
+const char *UnicodeRawString8 = u8R"("Hello\")";
+//CHECK: c"\22Hello\\\22\00"
+const char16_t *UnicodeRawString16 = uR"("Hello\")";
+//CHECK: [9 x i16] [i16 34, i16 72, i16 101, i16 108, i16 108, i16 111, i16 92, i16 34, i16 0]
+const char32_t *UnicodeRawString32 = UR"("Hello\")";
+//CHECK: [9 x i32] [i32 34, i32 72, i32 101, i32 108, i32 108, i32 111, i32 92, i32 34, i32 0]
+
+const char *UnicodeUCNString8 = u8"\u00E2\u00AC\U000000DF";
+//CHECK: c"\C3\A2\C2\AC\C3\9F\00"
+const char16_t *UnicodeUCNString16 = u"\u00E2\u00AC\U000000DF";
+//CHECK: [4 x i16] [i16 226, i16 172, i16 223, i16 0]
+const char32_t *UnicodeUCNString32 = U"\u00E2\u00AC\U000000DF";
+//CHECK: [4 x i32] [i32 226, i32 172, i32 223, i32 0]
Index: clang/test/CodeGen/systemz-charset.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/systemz-charset.c
@@ -0,0 +1,35 @@
+// RUN: %clang_cc1 %s -emit-llvm -triple s390x-none-zos -fexec-charset IBM-1047 -o - | FileCheck %s
+// RUN: %clang %s -emit-llvm -S -target s390x-ibm-zos -o - | FileCheck %s
+
+const char *UpperCaseLetters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+// CHECK: c"\C1\C2\C3\C4\C5\C6\C7\C8\C9\D1\D2\D3\D4\D5\D6\D7\D8\D9\E2\E3\E4\E5\E6\E7\E8\E9\00"
+
+const char *LowerCaseLetters = "abcdefghijklmnopqrstuvwxyz";
+//CHECK: c"\81\82\83\84\85\86\87\88\89\91\92\93\94\95\96\97\98\99\A2\A3\A4\A5\A6\A7\A8\A9\00"
+
+const char *Digits = "0123456789";
+// CHECK: c"\F0\F1\F2\F3\F4\F5\F6\F7\F8\F9\00"
+
+const char *SpecialCharacters = " .<(+|&!$*);^-/,%%_>`:#@=";
+// CHECK: c"@KLMNOPZ[\\]^_`akllmnyz{|~\00"
+
+const char *EscapeCharacters = "\a\b\f\n\r\t\v\\\'\"\?";
+//CHECK: c"/\16\0C\15\0D\05\0B\E0}\7Fo\00"
+
+const char *InvalidEscape = "\y\z";
+//CHECK: c"oo\00"
+
+const char *HexCharacters = "\x12\x13\x14";
+//CHECK: c"\12\13\14\00"
+
+const char *OctalCharacters = "\141\142\143";
+//CHECK: c"abc\00"
+
+const char singleChar = 'a';
+//CHECK: i8 -127
+
+const char *UcnCharacters = "\u00E2\u00AC\U000000DF";
+//CHECK: c"B\B0Y\00"
+
+// U+00FF (y with diaeresis) is the single byte 0xDF in IBM-1047.
+const char *Unicode = "ÿ";
+//CHECK: c"\DF\00"
Index: clang/lib/Lex/LiteralSupport.cpp
===================================================================
--- clang/lib/Lex/LiteralSupport.cpp
+++ clang/lib/Lex/LiteralSupport.cpp
@@ -93,7 +93,8 @@
                                   const char *ThisTokEnd, bool &HadError,
                                   FullSourceLoc Loc, unsigned CharWidth,
                                   DiagnosticsEngine *Diags,
-                                  const LangOptions &Features) {
+                                  const LangOptions &Features,
+                                  llvm::CharSetConverter *Converter) {
   const char *EscapeBegin = ThisTokBuf;
 
   // Skip the '\' char.
@@ -102,6 +103,8 @@
   // We know that this character can't be off the end of the buffer, because
   // that would have been \", which would not have been the end of string.
   unsigned ResultChar = *ThisTokBuf++;
+  bool Translate = true;
+  bool Invalid = false;
   switch (ResultChar) {
   // These map to themselves.
   case '\\': case '\'': case '"': case '?': break;
@@ -142,6 +145,7 @@
     ResultChar = 11;
     break;
   case 'x': { // Hex escape.
+    Translate = false;
     ResultChar = 0;
     if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
       if (Diags)
@@ -179,6 +183,7 @@
   case '4': case '5': case '6': case '7': {
     // Octal escapes.
     --ThisTokBuf;
+    Translate = false;
     ResultChar = 0;
 
     // Octal escapes are a series of octal digits with maximum length 3.
@@ -210,6 +215,7 @@
         << std::string(1, ResultChar);
     break;
   default:
+    Invalid = true;
     if (!Diags)
       break;
 
@@ -224,6 +230,15 @@
     break;
   }
 
+  if (Translate && Converter) {
+    // Invalid escapes become '?'. ByteChar has no terminator: pass length 1.
+    char ByteChar = Invalid ? '?' : ResultChar;
+    SmallString<8> ResultCharConv;
+    Converter->convert(StringRef(&ByteChar, 1), ResultCharConv);
+    assert(ResultCharConv.size() == 1 &&
+           "Char size increased after translation");
+    ResultChar = static_cast<unsigned char>(ResultCharConv[0]);
+  }
   return ResultChar;
 }
 
@@ -1261,6 +1276,7 @@
   HadError = false;
 
   Kind = kind;
+  LiteralConverter *LiteralConv = &PP.getLiteralConverter();
 
   const char *TokBegin = begin;
 
@@ -1322,6 +1338,10 @@
     largest_character_for_kind = 0x7Fu;
   }
 
+  llvm::CharSetConverter *Converter = nullptr;
+  if (!isUTFLiteral(Kind) && LiteralConv)
+    Converter = LiteralConv->getConverter(ToExecCharset);
+
   while (begin != end) {
     // Is this a span of non-escape characters?
     if (begin[0] != '\\') {
@@ -1359,6 +1379,16 @@
             HadError = true;
             PP.Diag(Loc, diag::err_character_too_large);
           }
+          if (!HadError && Converter) {
+            assert(Kind != tok::wide_char_constant &&
+                   "Wide character translation not supported");
+            char ByteChar = *tmp_out_start; // Single byte, no terminator.
+            SmallString<1> ConvertedChar;
+            Converter->convert(StringRef(&ByteChar, 1), ConvertedChar);
+            assert(ConvertedChar.size() == 1 &&
+                   "Char size increased after translation");
+            *tmp_out_start = static_cast<unsigned char>(ConvertedChar[0]);
+          }
         }
       }
 
@@ -1381,9 +1411,9 @@
     }
     unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
     uint64_t result =
-      ProcessCharEscape(TokBegin, begin, end, HadError,
-                        FullSourceLoc(Loc,PP.getSourceManager()),
-                        CharWidth, &PP.getDiagnostics(), PP.getLangOpts());
+        ProcessCharEscape(TokBegin, begin, end, HadError,
+                          FullSourceLoc(Loc, PP.getSourceManager()), CharWidth,
+                          &PP.getDiagnostics(), PP.getLangOpts(), nullptr);
     *buffer_begin++ = result;
   }
 
@@ -1491,17 +1521,21 @@
 ///         hex-digit hex-digit hex-digit hex-digit
 /// \endverbatim
 ///
-StringLiteralParser::
-StringLiteralParser(ArrayRef<Token> StringToks,
-                    Preprocessor &PP, bool Complain)
-  : SM(PP.getSourceManager()), Features(PP.getLangOpts()),
-    Target(PP.getTargetInfo()), Diags(Complain ? &PP.getDiagnostics() :nullptr),
-    MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
-    ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {
-  init(StringToks);
+
+StringLiteralParser::StringLiteralParser(ArrayRef<Token> StringToks,
+                                         Preprocessor &PP, bool Complain,
+                                         ConversionAction Action)
+    : SM(PP.getSourceManager()), Features(PP.getLangOpts()),
+      Target(PP.getTargetInfo()),
+      Diags(Complain ? &PP.getDiagnostics() : nullptr),
+      LiteralConv(&PP.getLiteralConverter()), MaxTokenLength(0), SizeBound(0),
+      CharByteWidth(0), Kind(tok::unknown), ResultPtr(ResultBuf.data()),
+      hadError(false), Pascal(false) {
+  init(StringToks, Action);
 }
 
-void StringLiteralParser::init(ArrayRef<Token> StringToks){
+void StringLiteralParser::init(ArrayRef<Token> StringToks,
+                               ConversionAction Action) {
   // The literal token may have come from an invalid source location (e.g. due
   // to a PCH error), in which case the token length will be 0.
   if (StringToks.empty() || StringToks[0].getLength() < 2)
@@ -1577,6 +1611,10 @@
 
   SourceLocation UDSuffixTokLoc;
 
+  llvm::CharSetConverter *Converter = nullptr;
+  if (!isUTFLiteral(Kind) && LiteralConv)
+    Converter = LiteralConv->getConverter(Action);
+
   for (unsigned i = 0, e = StringToks.size(); i != e; ++i) {
     const char *ThisTokBuf = &TokenBuf[0];
     // Get the spelling of the token, which eliminates trigraphs, etc.  We know
@@ -1684,6 +1722,16 @@
         if (CopyStringFragment(StringToks[i], ThisTokBegin, BeforeCRLF))
           hadError = true;
 
+        if (!hadError && Converter) {
+          assert(Kind != tok::wide_string_literal &&
+                 "Wide character translation not supported");
+          SmallString<256> CpConv;
+          int ResultLength = BeforeCRLF.size() * CharByteWidth;
+          char *Cp = ResultPtr - ResultLength;
+          Converter->convert(StringRef(Cp, ResultLength), CpConv);
+          memmove(Cp, CpConv.data(), CpConv.size()); // Copy converted bytes.
+          ResultPtr = Cp + CpConv.size();
+        }
         // Point into the \n inside the \r\n sequence and operate on the
         // remaining portion of the literal.
         RemainingTokenSpan = AfterCRLF.substr(1);
@@ -1717,25 +1765,45 @@
             ++ThisTokBuf;
           } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
 
+          int Length = ThisTokBuf - InStart;
           // Copy the character span over.
           if (CopyStringFragment(StringToks[i], ThisTokBegin,
                                  StringRef(InStart, ThisTokBuf - InStart)))
             hadError = true;
+
+          if (!hadError && Converter) {
+            assert(Kind != tok::wide_string_literal &&
+                   "Wide character translation not supported");
+            SmallString<256> CpConv;
+            int ResultLength = Length * CharByteWidth;
+            char *Cp = ResultPtr - ResultLength;
+            Converter->convert(StringRef(Cp, ResultLength), CpConv);
+            memmove(Cp, CpConv.data(), CpConv.size()); // Copy converted bytes.
+            ResultPtr = Cp + CpConv.size();
+          }
           continue;
         }
         // Is this a Universal Character Name escape?
         if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
-          EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
-                          ResultPtr, hadError,
+          char *Cp = ResultPtr;
+          EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, ResultPtr,
+                          hadError,
                           FullSourceLoc(StringToks[i].getLocation(), SM),
                           CharByteWidth, Diags, Features);
+
+          if (!hadError && Converter) {
+            SmallString<8> CpConv; // Cp..ResultPtr holds the encoded UCN bytes.
+            Converter->convert(StringRef(Cp, ResultPtr - Cp), CpConv);
+            memmove(Cp, CpConv.data(), CpConv.size());
+            ResultPtr = Cp + CpConv.size();
+          }
           continue;
         }
         // Otherwise, this is a non-UCN escape character.  Process it.
         unsigned ResultChar =
-          ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
-                            FullSourceLoc(StringToks[i].getLocation(), SM),
-                            CharByteWidth*8, Diags, Features);
+            ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
+                              FullSourceLoc(StringToks[i].getLocation(), SM),
+                              CharByteWidth * 8, Diags, Features, Converter);
 
         if (CharByteWidth == 4) {
           // FIXME: Make the type of the result buffer correct instead of
@@ -1929,8 +1997,8 @@
       ByteNo -= Len;
     } else {
       ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,
-                        FullSourceLoc(Tok.getLocation(), SM),
-                        CharByteWidth*8, Diags, Features);
+                        FullSourceLoc(Tok.getLocation(), SM), CharByteWidth * 8,
+                        Diags, Features, nullptr);
       --ByteNo;
     }
     assert(!HadError && "This method isn't valid on erroneous strings");
Index: clang/lib/Lex/LiteralConverter.cpp
===================================================================
--- /dev/null
+++ clang/lib/Lex/LiteralConverter.cpp
@@ -0,0 +1,68 @@
+//===--- LiteralConverter.cpp - Translator for String Literals -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Lex/LiteralConverter.h"
+#include "clang/Basic/DiagnosticDriver.h"
+
+using namespace llvm;
+
+// Returns the cached converter registered for Codepage, or nullptr if none.
+llvm::CharSetConverter *LiteralConverter::getConverter(const char *Codepage) {
+  auto Found = CharsetConverters.find(Codepage);
+  bool Missing = Found == CharsetConverters.end();
+  return Missing ? nullptr : &Found->second;
+}
+
+llvm::CharSetConverter *
+LiteralConverter::getConverter(ConversionAction Action) {
+  // Resolve Action to a charset name; anything else (NoConversion) maps to
+  // the internal charset.
+  StringRef Name = InternalCharset;
+  if (Action == ToSystemCharset)
+    Name = SystemCharset;
+  if (Action == ToExecCharset)
+    Name = ExecCharset;
+  return getConverter(Name.data());
+}
+
+// Returns the converter from the internal charset to To, creating and caching
+// it on first use. Returns nullptr if the converter cannot be created.
+llvm::CharSetConverter *
+LiteralConverter::createAndInsertCharConverter(const char *To) {
+  if (llvm::CharSetConverter *Cached = getConverter(To))
+    return Cached;
+
+  ErrorOr<CharSetConverter> ErrorOrConverter =
+      llvm::CharSetConverter::create(InternalCharset.data(), To);
+  if (!ErrorOrConverter)
+    return nullptr;
+  CharsetConverters.insert_or_assign(StringRef(To),
+                                     std::move(*ErrorOrConverter));
+  return getConverter(To); // Look up again: the cached pointer was null.
+}
+
+void LiteralConverter::setConvertersFromOptions(
+    const clang::LangOptions &Opts, const clang::TargetInfo &TInfo,
+    clang::DiagnosticsEngine &Diags) {
+  // Record the charset names; an empty ExecCharset option means UTF-8.
+  InternalCharset = "UTF-8";
+  SystemCharset = TInfo.getTriple().getSystemCharset();
+  ExecCharset = InternalCharset;
+  if (!Opts.ExecCharset.empty())
+    ExecCharset = Opts.ExecCharset;
+
+  // The system charset needs a converter only when it is not the internal one.
+  if (SystemCharset != InternalCharset)
+    createAndInsertCharConverter(SystemCharset.data());
+
+  // Likewise for the -fexec-charset value; diagnose it if creation fails.
+  if (ExecCharset != InternalCharset &&
+      !createAndInsertCharConverter(ExecCharset.data()))
+    Diags.Report(clang::diag::err_drv_invalid_value)
+        << "-fexec-charset" << ExecCharset;
+}
Index: clang/lib/Lex/CMakeLists.txt
===================================================================
--- clang/lib/Lex/CMakeLists.txt
+++ clang/lib/Lex/CMakeLists.txt
@@ -7,6 +7,7 @@
   HeaderMap.cpp
   HeaderSearch.cpp
   Lexer.cpp
+  LiteralConverter.cpp
   LiteralSupport.cpp
   MacroArgs.cpp
   MacroInfo.cpp
Index: clang/lib/Frontend/CompilerInstance.cpp
===================================================================
--- clang/lib/Frontend/CompilerInstance.cpp
+++ clang/lib/Frontend/CompilerInstance.cpp
@@ -12,6 +12,7 @@
 #include "clang/AST/Decl.h"
 #include "clang/Basic/CharInfo.h"
 #include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/DiagnosticDriver.h"
 #include "clang/Basic/FileManager.h"
 #include "clang/Basic/LangStandard.h"
 #include "clang/Basic/SourceManager.h"
@@ -29,6 +30,7 @@
 #include "clang/Frontend/Utils.h"
 #include "clang/Frontend/VerifyDiagnosticConsumer.h"
 #include "clang/Lex/HeaderSearch.h"
+#include "clang/Lex/LiteralConverter.h"
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Lex/PreprocessorOptions.h"
 #include "clang/Sema/CodeCompleteConsumer.h"
@@ -529,6 +531,8 @@
                            /*ShowAllHeaders=*/true, /*OutputPath=*/"",
                            /*ShowDepth=*/true, /*MSStyle=*/true);
   }
+  PP->getLiteralConverter().setConvertersFromOptions(getLangOpts(), getTarget(),
+                                                     getDiagnostics());
 }
 
 std::string CompilerInstance::getSpecificModuleCachePath(StringRef ModuleHash) {
Index: clang/lib/Driver/ToolChains/Clang.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Clang.cpp
+++ clang/lib/Driver/ToolChains/Clang.cpp
@@ -36,6 +36,7 @@
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Config/llvm-config.h"
 #include "llvm/Option/ArgList.h"
+#include "llvm/Support/CharSet.h"
 #include "llvm/Support/CodeGen.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Compression.h"
@@ -6218,14 +6219,21 @@
                                           << value;
   }
 
-  // -fexec_charset=UTF-8 is default. Reject others
+  // Set the default fexec-charset as the system charset.
+  CmdArgs.push_back("-fexec-charset");
+  CmdArgs.push_back(Args.MakeArgString(Triple.getSystemCharset()));
   if (Arg *execCharset = Args.getLastArg(options::OPT_fexec_charset_EQ)) {
     StringRef value = execCharset->getValue();
-    if (!value.equals_lower("utf-8"))
-      D.Diag(diag::err_drv_invalid_value) << execCharset->getAsString(Args)
-                                          << value;
+    llvm::ErrorOr<llvm::CharSetConverter> ErrorOrConverter =
+        llvm::CharSetConverter::create("UTF-8", value.data());
+    if (ErrorOrConverter) {
+      CmdArgs.push_back("-fexec-charset");
+      CmdArgs.push_back(Args.MakeArgString(value));
+    } else {
+      D.Diag(diag::err_drv_invalid_value)
+          << execCharset->getAsString(Args) << value;
+    }
   }
-
   RenderDiagnosticsOptions(D, Args, CmdArgs);
 
   // -fno-asm-blocks is default.
Index: clang/include/clang/Lex/Preprocessor.h
===================================================================
--- clang/include/clang/Lex/Preprocessor.h
+++ clang/include/clang/Lex/Preprocessor.h
@@ -23,6 +23,7 @@
 #include "clang/Basic/SourceManager.h"
 #include "clang/Basic/TokenKinds.h"
 #include "clang/Lex/Lexer.h"
+#include "clang/Lex/LiteralConverter.h"
 #include "clang/Lex/MacroInfo.h"
 #include "clang/Lex/ModuleLoader.h"
 #include "clang/Lex/ModuleMap.h"
@@ -141,6 +142,7 @@
   std::unique_ptr<ScratchBuffer> ScratchBuf;
   HeaderSearch      &HeaderInfo;
   ModuleLoader      &TheModuleLoader;
+  LiteralConverter LiteralConv;
 
   /// External source of macros.
   ExternalPreprocessorSource *ExternalSource;
@@ -931,6 +933,7 @@
   SelectorTable &getSelectorTable() { return Selectors; }
   Builtin::Context &getBuiltinInfo() { return *BuiltinInfo; }
   llvm::BumpPtrAllocator &getPreprocessorAllocator() { return BP; }
+  LiteralConverter &getLiteralConverter() { return LiteralConv; }
 
   void setExternalSource(ExternalPreprocessorSource *Source) {
     ExternalSource = Source;
Index: clang/include/clang/Lex/LiteralSupport.h
===================================================================
--- clang/include/clang/Lex/LiteralSupport.h
+++ clang/include/clang/Lex/LiteralSupport.h
@@ -17,10 +17,12 @@
 #include "clang/Basic/CharInfo.h"
 #include "clang/Basic/LLVM.h"
 #include "clang/Basic/TokenKinds.h"
+#include "clang/Lex/LiteralConverter.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/CharSet.h"
 #include "llvm/Support/DataTypes.h"
 
 namespace clang {
@@ -185,9 +187,8 @@
   SmallString<32> UDSuffixBuf;
   unsigned UDSuffixOffset;
 public:
-  CharLiteralParser(const char *begin, const char *end,
-                    SourceLocation Loc, Preprocessor &PP,
-                    tok::TokenKind kind);
+  CharLiteralParser(const char *begin, const char *end, SourceLocation Loc,
+                    Preprocessor &PP, tok::TokenKind kind);
 
   bool hadError() const { return HadError; }
   bool isAscii() const { return Kind == tok::char_constant; }
@@ -212,6 +213,7 @@
   const LangOptions &Features;
   const TargetInfo &Target;
   DiagnosticsEngine *Diags;
+  LiteralConverter *LiteralConv;
 
   unsigned MaxTokenLength;
   unsigned SizeBound;
@@ -223,19 +225,19 @@
   unsigned UDSuffixToken;
   unsigned UDSuffixOffset;
 public:
-  StringLiteralParser(ArrayRef<Token> StringToks,
-                      Preprocessor &PP, bool Complain = true);
-  StringLiteralParser(ArrayRef<Token> StringToks,
-                      const SourceManager &sm, const LangOptions &features,
-                      const TargetInfo &target,
+  StringLiteralParser(ArrayRef<Token> StringToks, Preprocessor &PP,
+                      bool Complain = true,
+                      ConversionAction Action = ToExecCharset);
+  StringLiteralParser(ArrayRef<Token> StringToks, const SourceManager &sm,
+                      const LangOptions &features, const TargetInfo &target,
                       DiagnosticsEngine *diags = nullptr)
-    : SM(sm), Features(features), Target(target), Diags(diags),
-      MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
-      ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {
-    init(StringToks);
+      : SM(sm), Features(features), Target(target), Diags(diags),
+        LiteralConv(nullptr), MaxTokenLength(0), SizeBound(0), CharByteWidth(0),
+        Kind(tok::unknown), ResultPtr(ResultBuf.data()), hadError(false),
+        Pascal(false) {
+    init(StringToks, NoConversion);
   }
 
-
   bool hadError;
   bool Pascal;
 
@@ -278,7 +280,7 @@
   static bool isValidUDSuffix(const LangOptions &LangOpts, StringRef Suffix);
 
 private:
-  void init(ArrayRef<Token> StringToks);
+  void init(ArrayRef<Token> StringToks, ConversionAction Action);
   bool CopyStringFragment(const Token &Tok, const char *TokBegin,
                           StringRef Fragment);
   void DiagnoseLexingError(SourceLocation Loc);
Index: clang/include/clang/Lex/LiteralConverter.h
===================================================================
--- /dev/null
+++ clang/include/clang/Lex/LiteralConverter.h
@@ -0,0 +1,36 @@
+//===--- clang/Lex/LiteralConverter.h - Translator for Literals -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LEX_LITERALCONVERTER_H
+#define LLVM_CLANG_LEX_LITERALCONVERTER_H
+
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/LangOptions.h"
+#include "clang/Basic/TargetInfo.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/CharSet.h"
+
+enum ConversionAction { NoConversion, ToSystemCharset, ToExecCharset };
+
+class LiteralConverter { // Holds charset names and their cached converters.
+  llvm::StringRef InternalCharset; // Charset every converter converts from.
+  llvm::StringRef SystemCharset;   // Name looked up for ToSystemCharset.
+  llvm::StringRef ExecCharset;     // Name looked up for ToExecCharset.
+  llvm::StringMap<llvm::CharSetConverter> CharsetConverters; // Keyed by name.
+
+public:
+  llvm::CharSetConverter *getConverter(const char *Codepage);
+  llvm::CharSetConverter *getConverter(ConversionAction Action);
+  llvm::CharSetConverter *createAndInsertCharConverter(const char *To);
+  void setConvertersFromOptions(const clang::LangOptions &Opts,
+                                const clang::TargetInfo &TInfo,
+                                clang::DiagnosticsEngine &Diags);
+};
+
+#endif
Index: clang/include/clang/Driver/Options.td
===================================================================
--- clang/include/clang/Driver/Options.td
+++ clang/include/clang/Driver/Options.td
@@ -4424,6 +4424,11 @@
 
 let Flags = [CC1Option, CC1AsOption, NoDriverOption] in {
 
+def fexec_charset : Separate<["-"], "fexec-charset">, MetaVarName<"<charset>">,
+  HelpText<"Set the execution <charset> for string and character literals. "
+           "Supported character encodings include ISO8859-1, UTF-8, IBM-1047 "
+           "and those supported by the host iconv library.">,
+  MarshallingInfoString<LangOpts<"ExecCharset">>;
 def target_cpu : Separate<["-"], "target-cpu">,
   HelpText<"Target a specific cpu type">,
   MarshallingInfoString<TargetOpts<"CPU">>;
Index: clang/include/clang/Basic/TokenKinds.h
===================================================================
--- clang/include/clang/Basic/TokenKinds.h
+++ clang/include/clang/Basic/TokenKinds.h
@@ -90,6 +90,13 @@
          isStringLiteral(K) || K == tok::header_name;
 }
 
+/// Return true if K is a UTF-8/16/32 character or string literal kind.
+inline bool isUTFLiteral(TokenKind K) {
+  return K == tok::utf8_string_literal || K == tok::utf16_string_literal ||
+         K == tok::utf32_string_literal || K == tok::utf8_char_constant ||
+         K == tok::utf16_char_constant || K == tok::utf32_char_constant;
+}
+
 /// Return true if this is any of tok::annot_* kinds.
 bool isAnnotation(TokenKind K);
 
Index: clang/include/clang/Basic/LangOptions.h
===================================================================
--- clang/include/clang/Basic/LangOptions.h
+++ clang/include/clang/Basic/LangOptions.h
@@ -342,6 +342,9 @@
   /// input is a header file (i.e. -x c-header).
   bool IsHeaderFile = false;
 
+  /// Name of the exec charset to convert the internal charset to.
+  std::string ExecCharset;
+
   LangOptions();
 
   // Define accessors/mutators for language options of enumeration type.

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D93031: Enable fexec-charset option

Reply via email to