================ @@ -0,0 +1,295 @@ +//=== MissingTerminatingZeroChecker.cpp -------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Check for string arguments passed to C library functions where the +// terminating zero is missing. +// +//===----------------------------------------------------------------------===// + +#include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h" +#include "clang/StaticAnalyzer/Core/Checker.h" +#include "clang/StaticAnalyzer/Core/CheckerManager.h" +#include "clang/StaticAnalyzer/Core/PathSensitive/CallDescription.h" +#include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h" +#include "clang/StaticAnalyzer/Core/PathSensitive/DynamicExtent.h" +#include "clang/StaticAnalyzer/Core/PathSensitive/MemRegion.h" +#include "llvm/ADT/BitVector.h" +#include <sstream> + +using namespace clang; +using namespace ento; + +namespace { + +struct StringData { + const MemRegion *StrRegion; + int64_t StrLength; + unsigned int Offset; + const llvm::BitVector *NonNullData; +}; + +class MissingTerminatingZeroChecker + : public Checker<check::Bind, check::PreCall> { +public: + void checkBind(SVal L, SVal V, const Stmt *S, CheckerContext &C) const; + void checkPreCall(const CallEvent &Call, CheckerContext &C) const; + + void initOptions(bool NoDefaultIgnore, StringRef IgnoreList); + +private: + const BugType BT{this, "Missing terminating zero"}; + + using IgnoreEntry = std::pair<int, int>; + /// Functions (identified by name only) to ignore. + /// The entry stores a parameter index, or -1. + llvm::StringMap<IgnoreEntry> FunctionsToIgnore = { + {"stpncpy", {1, -1}}, {"strncat", {1, -1}}, {"strncmp", {0, 1}}, + {"strncpy", {1, -1}}, {"strndup", {0, -1}}, {"strnlen", {0, -1}}, + }; + + bool checkArg(unsigned int ArgI, CheckerContext &C, + const CallEvent &Call) const; + bool getStringData(StringData &DataOut, ProgramStateRef State, + SValBuilder &SVB, const MemRegion *StrReg) const; + ProgramStateRef setStringData(ProgramStateRef State, Loc L, + const llvm::BitVector &NonNullData) const; + void reportBug(ExplodedNode *N, const Expr *E, CheckerContext &C, + const char Msg[]) const; +}; + +} // namespace + +namespace llvm { +template <> struct FoldingSetTrait<llvm::BitVector> { + static inline void Profile(llvm::BitVector X, FoldingSetNodeID &ID) { + ID.AddInteger(X.size()); + for (unsigned int I = 0; I < X.size(); ++I) + ID.AddBoolean(X[I]); + } +}; +} // end namespace llvm + +// Contains for a "string" (character array) region if elements are known to be +// non-zero. The bit vector is indexed by the array element index and is true +// if the element is known to be non-zero. Size of the vector does not +// correspond to the extent of the memory region (can be smaller), the missing +// elements are considered to be false. +// (A value of 'false' means that the string element is zero or unknown.) +REGISTER_MAP_WITH_PROGRAMSTATE(NonNullnessData, const MemRegion *, + llvm::BitVector) ---------------- steakhal wrote:
> > If I remember my first idea was to only check the affected (element) > > regions and it would be a simple checker. But somehow I did not find a way > > to iterate over the array elements (or subregions). > > I also recall that this limitation exists, but it could and should be > addressed by improving the interface of `RegionStore`. There is an API, it's called `iterBindings`, but I guess it's not suitable for you because it only iterates the direct bindings - and skips the default binding (if any). It's not commonly used, and there are even fever good use-cases for its existence. My second though to complement this weakness of `iterBindings` is using the `getDefaultBinding` api. Using these two could work. > (By the way @steakhal do you have any plans/ideas about modeling strcpy / > memcpy etc.? How difficult would it be to model them with a default binding > that places a suitable LazyCompoundVal?) I don't have any plans touching `strcpy` or `memcpy` modeling. And for the second question, copy operations are usually modeled by LCVs or if the subject is small, then a direct copy of each bindings. There is one Achilles heel of the RegionStore. Copying an object uses LCVs, which can only be bound as a default binding, which doesn't have an extent. It's correct that LCVs are default bindings. The problem is that default bindings doesn't have an extent, aka. end/length. There is a lengthy discussion of the proposal that we at Sonar believe would work, and we actually have a really early draft prototype of what I described there. Unfortunately, we never got back to finishing it. So in short, modeling `memcpy` works well as long as you copy the entire object. And it breaks under any other circumstances. > > Or would it be better to "get" (or create?) an element region for all > > elements of the string when for example a `strlen` is called? > > Yes, this is the standard approach, so it would be better (unless there are > unavoidable issues with it). Obviously we must introduce a practical limit > (32? 128? 256?) for the number of checked elements -- to avoid performance > issues -- but otherwise this should work. I'd be very careful of doing this. Asking for an ElementRegion will inherently allocate, and do hashtable lookups (to insert it). And how often is it the case that a string is made of individual byte manipulation instead of using a string literal (string literal region) or concatenation function (likely some conjured) or an array initializer expression (these are represented by CompoundVals). My guess is that it's fairly uncommon, but prove me wrong with data. > [...] Alternatively, if querying all elements of a string separately is too > slow, then we could extend the interface of RegionStore with "bulk query" > methods that can return knowledge about a larger memory area (e.g. string or > other array) without splitting it into separate elements. (The bulk query > should be able to combine multiple smaller separate bindings, but should be > efficient for the common case when the whole region is initialized from e.g. > a string literal.) I think I already covered this point at the beginning of this reply. https://github.com/llvm/llvm-project/pull/146664 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits