Author: bruno Date: Tue Mar 29 12:35:02 2016 New Revision: 264752 URL: http://llvm.org/viewvc/llvm-project?rev=264752&view=rev Log: [Sema] Handle UTF-8 invalid format string specifiers
Improve invalid format string specifier handling by printing out invalid specifiers characters with \x, \u and \U. Previously clang would print gargabe whenever the character is unprintable. Example, before: NSLog(@"%\u25B9"); => warning: invalid conversion specifier ' [-Wformat-invalid-specifier] after: NSLog(@"%\u25B9"); => warning: invalid conversion specifier '\u25b9' [-Wformat-invalid-specifier] Differential Revision: http://reviews.llvm.org/D18296 rdar://problem/24672159 Modified: cfe/trunk/include/clang/Analysis/Analyses/FormatString.h cfe/trunk/lib/Analysis/FormatString.cpp cfe/trunk/lib/Analysis/FormatStringParsing.h cfe/trunk/lib/Analysis/PrintfFormatString.cpp cfe/trunk/lib/Analysis/ScanfFormatString.cpp cfe/trunk/lib/Sema/SemaChecking.cpp cfe/trunk/test/Sema/format-strings-scanf.c cfe/trunk/test/Sema/format-strings.c cfe/trunk/test/SemaObjC/format-strings-objc.m Modified: cfe/trunk/include/clang/Analysis/Analyses/FormatString.h URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Analysis/Analyses/FormatString.h?rev=264752&r1=264751&r2=264752&view=diff ============================================================================== --- cfe/trunk/include/clang/Analysis/Analyses/FormatString.h (original) +++ cfe/trunk/include/clang/Analysis/Analyses/FormatString.h Tue Mar 29 12:35:02 2016 @@ -210,6 +210,7 @@ public: unsigned getLength() const { return EndScanList ? EndScanList - Position : 1; } + void setEndScanList(const char *pos) { EndScanList = pos; } bool isIntArg() const { return (kind >= IntArgBeg && kind <= IntArgEnd) || kind == FreeBSDrArg || kind == FreeBSDyArg; } @@ -413,11 +414,6 @@ public: bool isObjCArg() const { return kind >= ObjCBeg && kind <= ObjCEnd; } bool isDoubleArg() const { return kind >= DoubleArgBeg && kind <= DoubleArgEnd; } - unsigned getLength() const { - // Conversion specifiers currently only are represented by - // single characters, but we be flexible. - return 1; - } static bool classof(const analyze_format_string::ConversionSpecifier *CS) { return CS->isPrintfKind(); @@ -546,8 +542,6 @@ public: ScanfConversionSpecifier(const char *pos, Kind k) : ConversionSpecifier(false, pos, k) {} - void setEndScanList(const char *pos) { EndScanList = pos; } - static bool classof(const analyze_format_string::ConversionSpecifier *CS) { return !CS->isPrintfKind(); } Modified: cfe/trunk/lib/Analysis/FormatString.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Analysis/FormatString.cpp?rev=264752&r1=264751&r2=264752&view=diff ============================================================================== --- cfe/trunk/lib/Analysis/FormatString.cpp (original) +++ cfe/trunk/lib/Analysis/FormatString.cpp Tue Mar 29 12:35:02 2016 @@ -15,6 +15,7 @@ #include "FormatStringParsing.h" #include "clang/Basic/LangOptions.h" #include "clang/Basic/TargetInfo.h" +#include "llvm/Support/ConvertUTF.h" using clang::analyze_format_string::ArgType; using clang::analyze_format_string::FormatStringHandler; @@ -260,6 +261,28 @@ clang::analyze_format_string::ParseLengt return true; } +bool clang::analyze_format_string::ParseUTF8InvalidSpecifier( + const char *SpecifierBegin, const char *FmtStrEnd, unsigned &Len) { + if (SpecifierBegin + 1 >= FmtStrEnd) + return false; + + const UTF8 *SB = reinterpret_cast<const UTF8 *>(SpecifierBegin + 1); + const UTF8 *SE = reinterpret_cast<const UTF8 *>(FmtStrEnd); + const char FirstByte = *SB; + + // If the invalid specifier is a multibyte UTF-8 string, return the + // total length accordingly so that the conversion specifier can be + // properly updated to reflect a complete UTF-8 specifier. + unsigned NumBytes = getNumBytesForUTF8(FirstByte); + if (NumBytes == 1) + return false; + if (SB + NumBytes > SE) + return false; + + Len = NumBytes + 1; + return true; +} + //===----------------------------------------------------------------------===// // Methods on ArgType. //===----------------------------------------------------------------------===// Modified: cfe/trunk/lib/Analysis/FormatStringParsing.h URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Analysis/FormatStringParsing.h?rev=264752&r1=264751&r2=264752&view=diff ============================================================================== --- cfe/trunk/lib/Analysis/FormatStringParsing.h (original) +++ cfe/trunk/lib/Analysis/FormatStringParsing.h Tue Mar 29 12:35:02 2016 @@ -46,7 +46,13 @@ bool ParseArgPosition(FormatStringHandle /// FormatSpecifier& argument, and false otherwise. bool ParseLengthModifier(FormatSpecifier &FS, const char *&Beg, const char *E, const LangOptions &LO, bool IsScanf = false); - + +/// Returns true if the invalid specifier in \p SpecifierBegin is a UTF-8 +/// string; check that it won't go further than \p FmtStrEnd and write +/// up the total size in \p Len. +bool ParseUTF8InvalidSpecifier(const char *SpecifierBegin, + const char *FmtStrEnd, unsigned &Len); + template <typename T> class SpecifierResult { T FS; const char *Start; Modified: cfe/trunk/lib/Analysis/PrintfFormatString.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Analysis/PrintfFormatString.cpp?rev=264752&r1=264751&r2=264752&view=diff ============================================================================== --- cfe/trunk/lib/Analysis/PrintfFormatString.cpp (original) +++ cfe/trunk/lib/Analysis/PrintfFormatString.cpp Tue Mar 29 12:35:02 2016 @@ -312,8 +312,13 @@ static PrintfSpecifierResult ParsePrintf argIndex++; if (k == ConversionSpecifier::InvalidSpecifier) { + unsigned Len = I - Start; + if (ParseUTF8InvalidSpecifier(Start, E, Len)) { + CS.setEndScanList(Start + Len); + FS.setConversionSpecifier(CS); + } // Assume the conversion takes one argument. - return !H.HandleInvalidPrintfConversionSpecifier(FS, Start, I - Start); + return !H.HandleInvalidPrintfConversionSpecifier(FS, Start, Len); } return PrintfSpecifierResult(Start, FS); } Modified: cfe/trunk/lib/Analysis/ScanfFormatString.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Analysis/ScanfFormatString.cpp?rev=264752&r1=264751&r2=264752&view=diff ============================================================================== --- cfe/trunk/lib/Analysis/ScanfFormatString.cpp (original) +++ cfe/trunk/lib/Analysis/ScanfFormatString.cpp Tue Mar 29 12:35:02 2016 @@ -79,7 +79,7 @@ static ScanfSpecifierResult ParseScanfSp unsigned &argIndex, const LangOptions &LO, const TargetInfo &Target) { - + using namespace clang::analyze_format_string; using namespace clang::analyze_scanf; const char *I = Beg; const char *Start = nullptr; @@ -210,10 +210,15 @@ static ScanfSpecifierResult ParseScanfSp // FIXME: '%' and '*' doesn't make sense. Issue a warning. // FIXME: 'ConsumedSoFar' and '*' doesn't make sense. - + if (k == ScanfConversionSpecifier::InvalidSpecifier) { + unsigned Len = I - Beg; + if (ParseUTF8InvalidSpecifier(Beg, E, Len)) { + CS.setEndScanList(Beg + Len); + FS.setConversionSpecifier(CS); + } // Assume the conversion takes one argument. - return !H.HandleInvalidScanfConversionSpecifier(FS, Beg, I - Beg); + return !H.HandleInvalidScanfConversionSpecifier(FS, Beg, Len); } return ScanfSpecifierResult(Start, FS); } Modified: cfe/trunk/lib/Sema/SemaChecking.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Sema/SemaChecking.cpp?rev=264752&r1=264751&r2=264752&view=diff ============================================================================== --- cfe/trunk/lib/Sema/SemaChecking.cpp (original) +++ cfe/trunk/lib/Sema/SemaChecking.cpp Tue Mar 29 12:35:02 2016 @@ -36,6 +36,8 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallString.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/Locale.h" #include "llvm/Support/ConvertUTF.h" #include "llvm/Support/raw_ostream.h" #include <limits> @@ -3976,12 +3978,41 @@ CheckFormatHandler::HandleInvalidConvers // gibberish when trying to match arguments. keepGoing = false; } - - EmitFormatDiagnostic(S.PDiag(diag::warn_format_invalid_conversion) - << StringRef(csStart, csLen), - Loc, /*IsStringLocation*/true, - getSpecifierRange(startSpec, specifierLen)); - + + StringRef Specifier(csStart, csLen); + + // If the specifier in non-printable, it could be the first byte of a UTF-8 + // sequence. In that case, print the UTF-8 code point. If not, print the byte + // hex value. + std::string CodePointStr; + if (!llvm::sys::locale::isPrint(*csStart)) { + UTF32 CodePoint; + const UTF8 **B = reinterpret_cast<const UTF8 **>(&csStart); + const UTF8 *E = + reinterpret_cast<const UTF8 *>(csStart + csLen); + ConversionResult Result = + llvm::convertUTF8Sequence(B, E, &CodePoint, strictConversion); + + if (Result != conversionOK) { + unsigned char FirstChar = *csStart; + CodePoint = (UTF32)FirstChar; + } + + llvm::raw_string_ostream OS(CodePointStr); + if (CodePoint < 256) + OS << "\\x" << llvm::format("%02x", CodePoint); + else if (CodePoint <= 0xFFFF) + OS << "\\u" << llvm::format("%04x", CodePoint); + else + OS << "\\U" << llvm::format("%08x", CodePoint); + OS.flush(); + Specifier = CodePointStr; + } + + EmitFormatDiagnostic( + S.PDiag(diag::warn_format_invalid_conversion) << Specifier, Loc, + /*IsStringLocation*/ true, getSpecifierRange(startSpec, specifierLen)); + return keepGoing; } Modified: cfe/trunk/test/Sema/format-strings-scanf.c URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/Sema/format-strings-scanf.c?rev=264752&r1=264751&r2=264752&view=diff ============================================================================== --- cfe/trunk/test/Sema/format-strings-scanf.c (original) +++ cfe/trunk/test/Sema/format-strings-scanf.c Tue Mar 29 12:35:02 2016 @@ -183,3 +183,11 @@ void check_conditional_literal(char *s, scanf(i ? "%d" : "%d", i, s); // expected-warning{{data argument not used}} scanf(i ? "%s" : "%d", s); // expected-warning{{format specifies type 'int *'}} } + +void testInvalidNoPrintable(int *a) { + scanf("%\u25B9", a); // expected-warning {{invalid conversion specifier '\u25b9'}} + scanf("%\xE2\x96\xB9", a); // expected-warning {{invalid conversion specifier '\u25b9'}} + scanf("%\U00010348", a); // expected-warning {{invalid conversion specifier '\U00010348'}} + scanf("%\xF0\x90\x8D\x88", a); // expected-warning {{invalid conversion specifier '\U00010348'}} + scanf("%\xe2", a); // expected-warning {{invalid conversion specifier '\xe2'}} +} Modified: cfe/trunk/test/Sema/format-strings.c URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/Sema/format-strings.c?rev=264752&r1=264751&r2=264752&view=diff ============================================================================== --- cfe/trunk/test/Sema/format-strings.c (original) +++ cfe/trunk/test/Sema/format-strings.c Tue Mar 29 12:35:02 2016 @@ -642,6 +642,14 @@ void test_qualifiers(volatile int *vip, printf("%n", (cip_t)0); // expected-warning{{format specifies type 'int *' but the argument has type 'cip_t' (aka 'const int *')}} } +void testInvalidNoPrintable() { + printf("%\u25B9"); // expected-warning {{invalid conversion specifier '\u25b9'}} + printf("%\xE2\x96\xB9"); // expected-warning {{invalid conversion specifier '\u25b9'}} + printf("%\U00010348"); // expected-warning {{invalid conversion specifier '\U00010348'}} + printf("%\xF0\x90\x8D\x88"); // expected-warning {{invalid conversion specifier '\U00010348'}} + printf("%\xe2"); // expected-warning {{invalid conversion specifier '\xe2'}} +} + #pragma GCC diagnostic ignored "-Wformat-nonliteral" #pragma GCC diagnostic warning "-Wformat-security" // <rdar://problem/14178260> Modified: cfe/trunk/test/SemaObjC/format-strings-objc.m URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/SemaObjC/format-strings-objc.m?rev=264752&r1=264751&r2=264752&view=diff ============================================================================== --- cfe/trunk/test/SemaObjC/format-strings-objc.m (original) +++ cfe/trunk/test/SemaObjC/format-strings-objc.m Tue Mar 29 12:35:02 2016 @@ -265,3 +265,11 @@ void testObjCModifierFlags() { NSLog(@"%2$[tt]@ %1$[tt]s", @"Foo", @"Bar"); // expected-warning {{object format flags cannot be used with 's' conversion specifier}} } +// Test Objective-C invalid no printable specifiers +void testObjcInvalidNoPrintable(int *a) { + NSLog(@"%\u25B9"); // expected-warning {{invalid conversion specifier '\u25b9'}} + NSLog(@"%\xE2\x96\xB9"); // expected-warning {{invalid conversion specifier '\u25b9'}} + NSLog(@"%\U00010348"); // expected-warning {{invalid conversion specifier '\U00010348'}} + NSLog(@"%\xF0\x90\x8D\x88"); // expected-warning {{invalid conversion specifier '\U00010348'}} + NSLog(@"%\xe2"); // expected-warning {{input conversion stopped}} expected-warning {{invalid conversion specifier '\xe2'}} +} _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits