Screenshot: https://dmalcolm.fedorapeople.org/gcc/2015-09-10/spellcheck.html
There are a couple of FIXMEs here: * where to call levenshtein_distance_unit_tests * should we attempt error-recovery in c-typeck.c:build_component_ref gcc/ChangeLog: * Makefile.in (OBJS): Add spellcheck.o. * spellcheck.c: New file. * spellcheck.h: New file. gcc/c-family/ChangeLog: * c-common.h (lookup_name_fuzzy): New decl. gcc/c/ChangeLog: * c-decl.c: Include spellcheck.h. (lookup_name_fuzzy): New. * c-parser.c: Include spellcheck.h. (c_parser_declaration_or_fndef): If "unknown type name", attempt to suggest a close match using lookup_name_fuzzy. (c_parser_postfix_expression): Pass source range in calls to build_component_ref. (c_parser_postfix_expression_after_primary): Likewise. * c-tree.h (build_component_ref): Add source_range * param. * c-typeck.c: Include gcc-rich-location.h and spellcheck.h. (lookup_field_fuzzy_find_candidates): New function. (lookup_field_fuzzy): New function. (build_component_ref): Add "ident_range" param, use it when printing field-not-found error. Use lookup_field_fuzzy to suggest close matches. gcc/objc/ChangeLog: * objc-act.c (objc_build_component_ref): Pass NULL to new param of build_component_ref. gcc/testsuite/ChangeLog: * gcc.dg/spellcheck.c: New file. --- gcc/Makefile.in | 1 + gcc/c-family/c-common.h | 1 + gcc/c/c-decl.c | 37 +++++++++++ gcc/c/c-parser.c | 30 ++++++--- gcc/c/c-tree.h | 2 +- gcc/c/c-typeck.c | 92 +++++++++++++++++++++++++++- gcc/objc/objc-act.c | 3 +- gcc/spellcheck.c | 126 ++++++++++++++++++++++++++++++++++++++ gcc/spellcheck.h | 32 ++++++++++ gcc/testsuite/gcc.dg/spellcheck.c | 36 +++++++++++ 10 files changed, 347 insertions(+), 13 deletions(-) create mode 100644 gcc/spellcheck.c create mode 100644 gcc/spellcheck.h create mode 100644 gcc/testsuite/gcc.dg/spellcheck.c diff --git a/gcc/Makefile.in b/gcc/Makefile.in index c472696..7fbd80a 100644 --- a/gcc/Makefile.in +++ b/gcc/Makefile.in @@ -1392,6 +1392,7 @@ OBJS = \ shrink-wrap.o \ simplify-rtx.o \ sparseset.o \ + spellcheck.o \ sreal.o \ stack-ptr-mod.o \ statistics.o \ diff --git a/gcc/c-family/c-common.h b/gcc/c-family/c-common.h index b9a5d72..1903574 100644 --- a/gcc/c-family/c-common.h +++ b/gcc/c-family/c-common.h @@ -972,6 +972,7 @@ extern tree finish_label_address_expr (tree, location_t); different implementations. Used in c-common.c. */ extern tree lookup_label (tree); extern tree lookup_name (tree); +extern tree lookup_name_fuzzy (tree); extern bool lvalue_p (const_tree); extern bool vector_targets_convertible_p (const_tree t1, const_tree t2); diff --git a/gcc/c/c-decl.c b/gcc/c/c-decl.c index b7f0241..778c935 100644 --- a/gcc/c/c-decl.c +++ b/gcc/c/c-decl.c @@ -64,6 +64,7 @@ along with GCC; see the file COPYING3. If not see #include "c-family/c-ada-spec.h" #include "cilk.h" #include "builtins.h" +#include "spellcheck.h" /* In grokdeclarator, distinguish syntactic contexts of declarators. */ enum decl_context @@ -3900,6 +3901,42 @@ lookup_name_in_scope (tree name, struct c_scope *scope) return b->decl; return 0; } + +/* Look for the closest match for NAME within the currently valid + scopes. + + This finds the identifier with the lowest Levenshtein distance to + NAME. If there are multiple candidates with equal minimal distance, + the first one found is returned. Scopes are searched from innermost + outwards, and within a scope in reverse order of declaration, thus + benefiting candidates "near" to the current scope. */ + +tree +lookup_name_fuzzy (tree name) +{ + gcc_assert (TREE_CODE (name) == IDENTIFIER_NODE); + + c_binding *best_binding = NULL; + int best_distance = INT_MAX; + + for (c_scope *scope = current_scope; scope; scope = scope->outer) + for (c_binding *binding = scope->bindings; binding; binding = binding->prev) + { + if (!binding->id) + continue; + int dist = levenshtein_distance (name, binding->id); + if (dist < best_distance) + { + best_distance = dist; + best_binding = binding; + } + } + if (best_binding) + return best_binding->id; + else + return NULL; +} + /* Create the predefined scalar types of C, and some nodes representing standard constants (0, 1, (void *) 0). diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c index 0c62496..d134d85 100644 --- a/gcc/c/c-parser.c +++ b/gcc/c/c-parser.c @@ -66,6 +66,7 @@ along with GCC; see the file COPYING3. If not see #include "builtins.h" #include "gomp-constants.h" #include "c-family/c-indentation.h" +#include "spellcheck.h" /* Initialization routine for this file. */ @@ -1543,8 +1544,14 @@ c_parser_declaration_or_fndef (c_parser *parser, bool fndef_ok, || c_parser_peek_2nd_token (parser)->type == CPP_MULT) && (!nested || !lookup_name (c_parser_peek_token (parser)->value))) { - error_at (here, "unknown type name %qE", - c_parser_peek_token (parser)->value); + tree hint = lookup_name_fuzzy (c_parser_peek_token (parser)->value); + if (hint) + error_at (here, "unknown type name %qE; did you mean %qE?", + c_parser_peek_token (parser)->value, + hint); + else + error_at (here, "unknown type name %qE", + c_parser_peek_token (parser)->value); /* Parse declspecs normally to get a correct pointer type, but avoid a further "fails to be a type name" error. Refuse nested functions @@ -7425,7 +7432,8 @@ c_parser_postfix_expression (c_parser *parser) if (c_parser_next_token_is (parser, CPP_NAME)) { offsetof_ref = build_component_ref - (loc, offsetof_ref, c_parser_peek_token (parser)->value); + (loc, offsetof_ref, c_parser_peek_token (parser)->value, + &c_parser_peek_token (parser)->range); c_parser_consume_token (parser); while (c_parser_next_token_is (parser, CPP_DOT) || c_parser_next_token_is (parser, @@ -7453,7 +7461,8 @@ c_parser_postfix_expression (c_parser *parser) } offsetof_ref = build_component_ref (loc, offsetof_ref, - c_parser_peek_token (parser)->value); + c_parser_peek_token (parser)->value, + &c_parser_peek_token (parser)->range); c_parser_consume_token (parser); } else @@ -7913,6 +7922,7 @@ c_parser_postfix_expression_after_primary (c_parser *parser, vec<location_t> arg_loc = vNULL; location_t start; location_t finish; + source_range ident_range; while (true) { @@ -8031,10 +8041,12 @@ c_parser_postfix_expression_after_primary (c_parser *parser, expr.original_type = NULL; return expr; } + ident_range = c_parser_peek_token (parser)->range; start = EXPR_LOCATION_RANGE (expr.value).m_start; - finish = c_parser_peek_token (parser)->range.m_finish; + finish = ident_range.m_finish; c_parser_consume_token (parser); - expr.value = build_component_ref (op_loc, expr.value, ident); + expr.value = build_component_ref (op_loc, expr.value, ident, + &ident_range); set_source_range (&expr.value, start, finish); expr.original_code = ERROR_MARK; if (TREE_CODE (expr.value) != COMPONENT_REF) @@ -8063,14 +8075,16 @@ c_parser_postfix_expression_after_primary (c_parser *parser, expr.original_type = NULL; return expr; } + ident_range = c_parser_peek_token (parser)->range; start = EXPR_LOCATION_RANGE (expr.value).m_start; - finish = c_parser_peek_token (parser)->range.m_finish; + finish = ident_range.m_finish; c_parser_consume_token (parser); expr.value = build_component_ref (op_loc, build_indirect_ref (op_loc, expr.value, RO_ARROW), - ident); + ident, + &ident_range); set_source_range (&expr.value, start, finish); expr.original_code = ERROR_MARK; if (TREE_CODE (expr.value) != COMPONENT_REF) diff --git a/gcc/c/c-tree.h b/gcc/c/c-tree.h index 0810a74..4c3c637 100644 --- a/gcc/c/c-tree.h +++ b/gcc/c/c-tree.h @@ -593,7 +593,7 @@ extern struct c_expr convert_lvalue_to_rvalue (location_t, struct c_expr, bool, bool); extern void mark_exp_read (tree); extern tree composite_type (tree, tree); -extern tree build_component_ref (location_t, tree, tree); +extern tree build_component_ref (location_t, tree, tree, source_range *); extern tree build_array_ref (location_t, tree, tree); extern tree build_external_ref (source_range, tree, int, tree *); extern void pop_maybe_used (bool); diff --git a/gcc/c/c-typeck.c b/gcc/c/c-typeck.c index 506abb3..507400b 100644 --- a/gcc/c/c-typeck.c +++ b/gcc/c/c-typeck.c @@ -54,6 +54,8 @@ along with GCC; see the file COPYING3. If not see #include "c-family/c-ubsan.h" #include "cilk.h" #include "gomp-constants.h" +#include "gcc-rich-location.h" +#include "spellcheck.h" /* Possible cases of implicit bad conversions. Used to select diagnostic messages in convert_for_assignment. */ @@ -2259,12 +2261,72 @@ lookup_field (tree type, tree component) return tree_cons (NULL_TREE, field, NULL_TREE); } +/* Recursively append candidate IDENTIFIER_NODEs to CANDIDATES. */ + +static void +lookup_field_fuzzy_find_candidates (tree type, tree component, + vec<tree> *candidates) +{ + tree field; + for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field)) + { + if (DECL_NAME (field) == NULL_TREE + && (TREE_CODE (TREE_TYPE (field)) == RECORD_TYPE + || TREE_CODE (TREE_TYPE (field)) == UNION_TYPE)) + { + lookup_field_fuzzy_find_candidates (TREE_TYPE (field), + component, + candidates); + } + + if (DECL_NAME (field)) + candidates->safe_push (field); + } +} + +/* Like "lookup_field", but find the closest match, rather than + necessarily an exact match. */ + +static tree +lookup_field_fuzzy (tree type, tree component) +{ + gcc_assert (TREE_CODE (component) == IDENTIFIER_NODE); + + /* FIXME: move this to a unittest suite. */ + levenshtein_distance_unit_tests (); + + /* First, gather a list of candidates. */ + auto_vec <tree> candidates; + + lookup_field_fuzzy_find_candidates (type, component, + &candidates); + + /* Now determine which is closest. */ + int i; + tree field; + tree best_field = NULL; + int best_distance = INT_MAX; + FOR_EACH_VEC_ELT (candidates, i, field) + { + int dist = levenshtein_distance (component, DECL_NAME (field)); + if (dist < best_distance) + { + best_distance = dist; + best_field = field; + } + } + + return best_field; +} + /* Make an expression to refer to the COMPONENT field of structure or union value DATUM. COMPONENT is an IDENTIFIER_NODE. LOC is the - location of the COMPONENT_REF. */ + location of the COMPONENT_REF. IDENT_RANGE points to the source range of + the identifier, or is NULL. */ tree -build_component_ref (location_t loc, tree datum, tree component) +build_component_ref (location_t loc, tree datum, tree component, + source_range *ident_range) { tree type = TREE_TYPE (datum); enum tree_code code = TREE_CODE (type); @@ -2294,7 +2356,31 @@ build_component_ref (location_t loc, tree datum, tree component) if (!field) { - error_at (loc, "%qT has no member named %qE", type, component); + if (!ident_range) + { + error_at (loc, "%qT has no member named %qE", + type, component); + return error_mark_node; + } + gcc_rich_location richloc (*ident_range); + if (TREE_CODE (datum) == INDIRECT_REF) + richloc.add_expr (TREE_OPERAND (datum, 0)); + else + richloc.add_expr (datum); + field = lookup_field_fuzzy (type, component); + if (field) + { + error_at_rich_loc + (&richloc, + "%qT has no member named %qE; did you mean %qE?", + type, component, field); + /* FIXME: error recovery: should we try to keep going, + with "field"? (having issued an error, and hence no + output). */ + } + else + error_at_rich_loc (&richloc, "%qT has no member named %qE", + type, component); return error_mark_node; } diff --git a/gcc/objc/objc-act.c b/gcc/objc/objc-act.c index a1e32fc..04f2824 100644 --- a/gcc/objc/objc-act.c +++ b/gcc/objc/objc-act.c @@ -2665,7 +2665,8 @@ objc_build_component_ref (tree datum, tree component) return finish_class_member_access_expr (datum, component, false, tf_warning_or_error); #else - return build_component_ref (input_location, datum, component); + return build_component_ref (input_location, datum, component, + NULL); #endif } diff --git a/gcc/spellcheck.c b/gcc/spellcheck.c new file mode 100644 index 0000000..2892381 --- /dev/null +++ b/gcc/spellcheck.c @@ -0,0 +1,126 @@ +/* Find near-matches for strings and identifiers. + Copyright (C) 2015 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +<http://www.gnu.org/licenses/>. */ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "tm.h" +#include "tree.h" +#include "spellcheck.h" + +/* The Levenshtein distance is an "edit-distance": the minimal + number of one-character insertions, removals or substitutions + that are needed to change one string into another. + + This implementation uses the Wagner-Fischer algorithm. + + It is currently a naive implementation, various optimizations are + possible (e.g. removal of the M*N allocation in favor of a pair + of allocations of M and N). */ + +static int +levenshtein_distance (const char *s, int m, + const char *t, int n) +{ + const bool debug = false; + + int *d = new int [(m + 1) * (n + 1)]; + if (debug) + memset (d, 0xab, (m + 1) * (n + 1) * sizeof (int)); + +#define D(I, J) (d[((I) * (n + 1)) + (J)]) + + for (int i = 0; i <= m; i++) + D(i, 0) = i; + for (int j = 0; j <= n; j++) + D(0, j) = j; + + for (int j = 1; j <= n; j++) + for (int i = 1; i <= m; i++) + if (s[i - 1] == t[j - 1]) + D(i, j) = D(i - 1, j - 1); + else + { + int deletion = D(i - 1, j ) + 1; + int insertion = D(i, j - 1) + 1; + int substitution = D(i - 1, j - 1) + 1; + int cheapest = MIN (deletion, insertion); + cheapest = MIN (cheapest, substitution); + D(i, j) = cheapest; + } + + if (debug) + { + printf ("s=\"%s\" t=\"%s\"\n", s, t); + for (int j = 0; j <= n; j++) + { + for (int i = 0; i <= m; i++) + printf ("%i ", D(i, j)); + printf ("\n"); + } + } + + int result = D(m, n); +#undef D + + delete[] d; + + return result; +} + +/* Calculate Levenshtein distance between two nil-terminated strings. + This exists purely for the unit tests. */ + +int +levenshtein_distance (const char *s, const char *t) +{ + return levenshtein_distance (s, strlen (s), t, strlen (t)); +} + +/* Unit tests for levenshtein_distance. */ + +void +levenshtein_distance_unit_tests (void) +{ + gcc_assert (levenshtein_distance ("", "nonempty") == strlen ("nonempty")); + gcc_assert (levenshtein_distance ("nonempty", "") == strlen ("nonempty")); + gcc_assert (levenshtein_distance ("saturday", "sunday") == 3); + gcc_assert (levenshtein_distance ("sunday", "saturday") == 3); + gcc_assert (levenshtein_distance ("foo", "m_foo") == 2); + gcc_assert (levenshtein_distance ("m_foo", "foo") == 2); + gcc_assert (levenshtein_distance ("hello_world", "HelloWorld") == 3); + gcc_assert (levenshtein_distance + ("the quick brown fox jumps over the lazy dog", "dog") == 40); + gcc_assert (levenshtein_distance + ("the quick brown fox jumps over the lazy dog", "fox") == 40); +} + +/* Calculate Levenshtein distance between two identifiers. */ + +int +levenshtein_distance (tree ident_s, tree ident_t) +{ + gcc_assert (TREE_CODE (ident_s) == IDENTIFIER_NODE); + gcc_assert (TREE_CODE (ident_t) == IDENTIFIER_NODE); + + return levenshtein_distance (IDENTIFIER_POINTER (ident_s), + IDENTIFIER_LENGTH (ident_s), + IDENTIFIER_POINTER (ident_t), + IDENTIFIER_LENGTH (ident_t)); +} diff --git a/gcc/spellcheck.h b/gcc/spellcheck.h new file mode 100644 index 0000000..7900aa2 --- /dev/null +++ b/gcc/spellcheck.h @@ -0,0 +1,32 @@ +/* Find near-matches for strings and identifiers. + Copyright (C) 2015 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +<http://www.gnu.org/licenses/>. */ + +#ifndef GCC_SPELLCHECK_H +#define GCC_SPELLCHECK_H + +extern void +levenshtein_distance_unit_tests (void); + +extern int +levenshtein_distance (const char *s, const char *t); + +extern int +levenshtein_distance (tree ident_s, tree ident_t); + +#endif /* GCC_SPELLCHECK_H */ diff --git a/gcc/testsuite/gcc.dg/spellcheck.c b/gcc/testsuite/gcc.dg/spellcheck.c new file mode 100644 index 0000000..e34ade8 --- /dev/null +++ b/gcc/testsuite/gcc.dg/spellcheck.c @@ -0,0 +1,36 @@ +/* { dg-do compile } */ +/* { dg-options "-fdiagnostics-show-caret" } */ + +struct foo +{ + int foo; + int bar; + int baz; +}; + +int test (struct foo *ptr) +{ + return ptr->m_bar; /* { dg-error "'struct foo' has no member named 'm_bar'; did you mean 'bar'?" } */ + +/* { dg-begin-multiline-output "" } + return ptr->m_bar; + ~~~ ^~~~~ + { dg-end-multiline-output "" } */ +} + +int test2 (void) +{ + struct foo instance = {}; + return instance.m_bar; /* { dg-error "'struct foo' has no member named 'm_bar'; did you mean 'bar'?" } */ + +/* { dg-begin-multiline-output "" } + return instance.m_bar; + ~~~~~~~~ ^~~~~ + { dg-end-multiline-output "" } */ +} + +int64 foo; /* { dg-error "unknown type name 'int64'; did you mean 'int'?" } */ +/* { dg-begin-multiline-output "" } + int64 foo; + ^~~~~ + { dg-end-multiline-output "" } */ -- 1.8.5.3