Author: sammccall Date: Fri Jul 20 01:01:37 2018 New Revision: 337527 URL: http://llvm.org/viewvc/llvm-project?rev=337527&view=rev Log: [clangd] FuzzyMatch exposes an API for its word segmentation. NFC
Summary: This is intended to be used for indexing, e.g. in D49417 Reviewers: ioeric, omtcyfz Subscribers: ilya-biryukov, MaskRay, jkorous, arphaman, cfe-commits Differential Revision: https://reviews.llvm.org/D49540 Modified: clang-tools-extra/trunk/clangd/FuzzyMatch.cpp clang-tools-extra/trunk/clangd/FuzzyMatch.h clang-tools-extra/trunk/unittests/clangd/FuzzyMatchTests.cpp Modified: clang-tools-extra/trunk/clangd/FuzzyMatch.cpp URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/clangd/FuzzyMatch.cpp?rev=337527&r1=337526&r2=337527&view=diff ============================================================================== --- clang-tools-extra/trunk/clangd/FuzzyMatch.cpp (original) +++ clang-tools-extra/trunk/clangd/FuzzyMatch.cpp Fri Jul 20 01:01:37 2018 @@ -87,8 +87,8 @@ FuzzyMatcher::FuzzyMatcher(StringRef Pat for (int W = 0; W < P; ++W) for (Action A : {Miss, Match}) Scores[P][W][A] = {AwfulScore, Miss}; - if (PatN > 0) - calculateRoles(Pat, PatRole, PatTypeSet, PatN); + PatTypeSet = + calculateRoles(StringRef(Pat, PatN), makeMutableArrayRef(PatRole, PatN)); } Optional<float> FuzzyMatcher::match(StringRef Word) { @@ -110,25 +110,6 @@ Optional<float> FuzzyMatcher::match(Stri return Score; } -// Segmentation of words and patterns. -// A name like "fooBar_baz" consists of several parts foo, bar, baz. -// Aligning segmentation of word and pattern improves the fuzzy-match. -// For example: [lol] matches "LaughingOutLoud" better than "LionPopulation" -// -// First we classify each character into types (uppercase, lowercase, etc). -// Then we look at the sequence: e.g. [upper, lower] is the start of a segment. - -// We only distinguish the types of characters that affect segmentation. -// It's not obvious how to segment digits, we treat them as lowercase letters. -// As we don't decode UTF-8, we treat bytes over 127 as lowercase too. -// This means we require exact (case-sensitive) match. -enum FuzzyMatcher::CharType : unsigned char { - Empty = 0, // Before-the-start and after-the-end (and control chars). - Lower = 1, // Lowercase letters, digits, and non-ASCII bytes. - Upper = 2, // Uppercase letters. - Punctuation = 3, // ASCII punctuation (including Space) -}; - // We get CharTypes from a lookup table. Each is 2 bits, 4 fit in each byte. // The top 6 bits of the char select the byte, the bottom 2 select the offset. // e.g. 'q' = 010100 01 = byte 28 (55), bits 3-2 (01) -> Lower. @@ -147,17 +128,6 @@ constexpr static uint8_t CharTypes[] = { 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, }; -// Each character's Role is the Head or Tail of a segment, or a Separator. -// e.g. XMLHttpRequest_Async -// +--+---+------ +---- -// ^Head ^Tail ^Separator -enum FuzzyMatcher::CharRole : unsigned char { - Unknown = 0, // Stray control characters or impossible states. - Tail = 1, // Part of a word segment, but not the first character. - Head = 2, // The first character of a word segment. - Separator = 3, // Punctuation characters that separate word segments. -}; - // The Role can be determined from the Type of a character and its neighbors: // // Example | Chars | Type | Role @@ -183,26 +153,28 @@ constexpr static uint8_t CharRoles[] = { template <typename T> static T packedLookup(const uint8_t *Data, int I) { return static_cast<T>((Data[I >> 2] >> ((I & 3) * 2)) & 3); } -void FuzzyMatcher::calculateRoles(const char *Text, CharRole *Out, int &TypeSet, - int N) { - assert(N > 0); +CharTypeSet calculateRoles(StringRef Text, MutableArrayRef<CharRole> Roles) { + assert(Text.size() == Roles.size()); + if (Text.size() == 0) + return 0; CharType Type = packedLookup<CharType>(CharTypes, Text[0]); - TypeSet = 1 << Type; + CharTypeSet TypeSet = 1 << Type; // Types holds a sliding window of (Prev, Curr, Next) types. // Initial value is (Empty, Empty, type of Text[0]). int Types = Type; // Rotate slides in the type of the next character. auto Rotate = [&](CharType T) { Types = ((Types << 2) | T) & 0x3f; }; - for (int I = 0; I < N - 1; ++I) { + for (unsigned I = 0; I < Text.size() - 1; ++I) { // For each character, rotate in the next, and look up the role. Type = packedLookup<CharType>(CharTypes, Text[I + 1]); TypeSet |= 1 << Type; Rotate(Type); - *Out++ = packedLookup<CharRole>(CharRoles, Types); + Roles[I] = packedLookup<CharRole>(CharRoles, Types); } // For the last character, the "next character" is Empty. Rotate(Empty); - *Out++ = packedLookup<CharRole>(CharRoles, Types); + Roles[Text.size() - 1] = packedLookup<CharRole>(CharRoles, Types); + return TypeSet; } // Sets up the data structures matching Word. @@ -228,7 +200,8 @@ bool FuzzyMatcher::init(StringRef NewWor // FIXME: some words are hard to tokenize algorithmically. // e.g. vsprintf is V S Print F, and should match [pri] but not [int]. // We could add a tokenization dictionary for common stdlib names. - calculateRoles(Word, WordRole, WordTypeSet, WordN); + WordTypeSet = calculateRoles(StringRef(Word, WordN), + makeMutableArrayRef(WordRole, WordN)); return true; } Modified: clang-tools-extra/trunk/clangd/FuzzyMatch.h URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/clangd/FuzzyMatch.h?rev=337527&r1=337526&r2=337527&view=diff ============================================================================== --- clang-tools-extra/trunk/clangd/FuzzyMatch.h (original) +++ clang-tools-extra/trunk/clangd/FuzzyMatch.h Fri Jul 20 01:01:37 2018 @@ -16,6 +16,7 @@ #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringRef.h" @@ -24,6 +25,48 @@ namespace clang { namespace clangd { +// Utilities for word segmentation. +// FuzzyMatcher already incorporates this logic, so most users don't need this. +// +// A name like "fooBar_baz" consists of several parts foo, bar, baz. +// Aligning segmentation of word and pattern improves the fuzzy-match. +// For example: [lol] matches "LaughingOutLoud" better than "LionPopulation" +// +// First we classify each character into types (uppercase, lowercase, etc). +// Then we look at the sequence: e.g. [upper, lower] is the start of a segment. + +// We distinguish the types of characters that affect segmentation. +// It's not obvious how to segment digits, we treat them as lowercase letters. +// As we don't decode UTF-8, we treat bytes over 127 as lowercase too. +// This means we require exact (case-sensitive) match for those characters. +enum CharType : unsigned char { + Empty = 0, // Before-the-start and after-the-end (and control chars). + Lower = 1, // Lowercase letters, digits, and non-ASCII bytes. + Upper = 2, // Uppercase letters. + Punctuation = 3, // ASCII punctuation (including Space) +}; +// A CharTypeSet is a bitfield representing all the character types in a word. +// Its bits are 1<<Empty, 1<<Lower, etc. +using CharTypeSet = unsigned char; + +// Each character's Role is the Head or Tail of a segment, or a Separator. +// e.g. XMLHttpRequest_Async +// +--+---+------ +---- +// ^Head ^Tail ^Separator +enum CharRole : unsigned char { + Unknown = 0, // Stray control characters or impossible states. + Tail = 1, // Part of a word segment, but not the first character. + Head = 2, // The first character of a word segment. + Separator = 3, // Punctuation characters that separate word segments. +}; + +// Compute segmentation of Text. +// Character roles are stored in Roles (Roles.size() must equal Text.size()). +// The set of character types encountered is returned, this may inform +// heuristics for dealing with poorly-segmented identifiers like "strndup". +CharTypeSet calculateRoles(llvm::StringRef Text, + llvm::MutableArrayRef<CharRole> Roles); + // A matcher capable of matching and scoring strings against a single pattern. // It's optimized for matching against many strings - match() does not allocate. class FuzzyMatcher { @@ -48,8 +91,6 @@ public: private: // We truncate the pattern and the word to bound the cost of matching. constexpr static int MaxPat = 63, MaxWord = 127; - enum CharRole : unsigned char; // For segmentation. - enum CharType : unsigned char; // For segmentation. // Action describes how a word character was matched to the pattern. // It should be an enum, but this causes bitfield problems: // - for MSVC the enum type must be explicitly unsigned for correctness @@ -60,7 +101,6 @@ private: bool init(llvm::StringRef Word); void buildGraph(); - void calculateRoles(const char *Text, CharRole *Out, int &Types, int N); bool allowMatch(int P, int W, Action Last) const; int skipPenalty(int W, Action Last) const; int matchBonus(int P, int W, Action Last) const; @@ -70,7 +110,7 @@ private: int PatN; // Length char LowPat[MaxPat]; // Pattern in lowercase CharRole PatRole[MaxPat]; // Pattern segmentation info - int PatTypeSet; // Bitmask of 1<<CharType for all Pattern characters + CharTypeSet PatTypeSet; // Bitmask of 1<<CharType for all Pattern characters float ScoreScale; // Normalizes scores for the pattern length. // Word data is initialized on each call to match(), mostly by init(). @@ -78,7 +118,7 @@ private: int WordN; // Length char LowWord[MaxWord]; // Word in lowercase CharRole WordRole[MaxWord]; // Word segmentation info - int WordTypeSet; // Bitmask of 1<<CharType for all Word characters + CharTypeSet WordTypeSet; // Bitmask of 1<<CharType for all Word characters bool WordContainsPattern; // Simple substring check // Cumulative best-match score table. Modified: clang-tools-extra/trunk/unittests/clangd/FuzzyMatchTests.cpp URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/unittests/clangd/FuzzyMatchTests.cpp?rev=337527&r1=337526&r2=337527&view=diff ============================================================================== --- clang-tools-extra/trunk/unittests/clangd/FuzzyMatchTests.cpp (original) +++ clang-tools-extra/trunk/unittests/clangd/FuzzyMatchTests.cpp Fri Jul 20 01:01:37 2018 @@ -273,6 +273,29 @@ TEST(FuzzyMatch, Scoring) { EXPECT_THAT("Abs", matches("[abs]", 2.f)); } +// Returns pretty-printed segmentation of Text. +// e.g. std::basic_string --> +-- +---- +----- +std::string segment(StringRef Text) { + std::vector<CharRole> Roles(Text.size()); + calculateRoles(Text, Roles); + std::string Printed; + for (unsigned I = 0; I < Text.size(); ++I) + Printed.push_back("?-+ "[static_cast<unsigned>(Roles[I])]); + return Printed; +} + +// this is a no-op hack so clang-format will vertically align our testcases. +StringRef returns(StringRef Text) { return Text; } + +TEST(FuzzyMatch, Segmentation) { + EXPECT_THAT(segment("std::basic_string"), // + returns("+-- +---- +-----")); + EXPECT_THAT(segment("XMLHttpRequest"), // + returns("+--+---+------")); + EXPECT_THAT(segment("t3h PeNgU1N oF d00m!!!!!!!!"), // + returns("+-- +-+-+-+ ++ +--- ")); +} + } // namespace } // namespace clangd } // namespace clang _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits