patch 9.2.0635: checking the syntax contains/cluster list is slow

Commit: 
https://github.com/vim/vim/commit/48fbae4378ab9bb6e7fcbd2480544847eac10cd7
Author: Hirohito Higashi <[email protected]>
Date:   Sat Jun 13 18:41:28 2026 +0000

    patch 9.2.0635: checking the syntax contains/cluster list is slow
    
    Problem:  Deciding whether a group is in a "contains"/cluster list scans
              the list and expands clusters on every check, which is slow for
              syntaxes with large lists (e.g. plugins such as netrw).
    Solution: Resolve each list once into a sorted, cluster-expanded set of
              group IDs and use a binary search; cache it per syntax block and
              drop the cache when syntax definitions change (Hirohito Higashi).
    
    closes: #20490
    
    Co-Authored-By: Claude Opus 4.7 (1M context) <[email protected]>
    Signed-off-by: Hirohito Higashi <[email protected]>
    Signed-off-by: Christian Brabandt <[email protected]>

diff --git a/runtime/doc/testing.txt b/runtime/doc/testing.txt
index 24e3a4aa2..8e8831b42 100644
--- a/runtime/doc/testing.txt
+++ b/runtime/doc/testing.txt
@@ -409,6 +409,8 @@ test_override({name}, {val})                                
*test_override()*
                redraw       disable the redrawing() function.
                redraw_flag  ignore the RedrawingDisabled flag.
                starting     reset the "starting" variable, see below.
+               syn_idlist_cache        disable the cache for resolving
+                                       syntax "contains"/cluster lists.
                term_props   reset all terminal properties when the version
                             string is detected.
                ui_delay     time in msec to use in ui_delay(); overrules a
diff --git a/src/globals.h b/src/globals.h
index 96d678545..855072523 100644
--- a/src/globals.h
+++ b/src/globals.h
@@ -2007,6 +2007,7 @@ EXTERN int  disable_char_avail_for_testing INIT(= FALSE);
 EXTERN int  disable_redraw_for_testing INIT(= FALSE);
 EXTERN int  ignore_redraw_flag_for_testing INIT(= FALSE);
 EXTERN int  nfa_fail_for_testing INIT(= FALSE);
+EXTERN int  disable_syn_idlist_cache_for_testing INIT(= FALSE);
 EXTERN int  no_query_mouse_for_testing INIT(= FALSE);
 EXTERN int  ui_delay_for_testing INIT(= 0);
 EXTERN int  reset_term_props_on_termresponse INIT(= FALSE);
diff --git a/src/structs.h b/src/structs.h
index c831e3341..92d4441be 100644
--- a/src/structs.h
+++ b/src/structs.h
@@ -3193,6 +3193,9 @@ typedef struct {
     int                b_sst_freecount;
     linenr_T   b_sst_check_lnum;
     short_u    b_sst_lasttick; // last display tick
+
+    // Cache for in_id_list(); see idl_cache_T in syntax.c.
+    void       *b_idlist_cache;
 #endif // FEAT_SYN_HL
 
 #ifdef FEAT_SPELL
diff --git a/src/syntax.c b/src/syntax.c
index c845a1051..4333c3183 100644
--- a/src/syntax.c
+++ b/src/syntax.c
@@ -345,6 +345,7 @@ static void init_syn_patterns(void);
 static char_u *get_syn_pattern(char_u *arg, synpat_T *ci);
 static int get_id_list(char_u **arg, int keylen, short **list, int skip);
 static void syn_combine_list(short **clstr1, short **clstr2, int list_op);
+static void idl_cache_clear(synblock_T *block);
 
 /*
  * Start the syntax recognition for a line.  This function is normally called
@@ -1027,6 +1028,9 @@ syn_stack_free_block(synblock_T *block)
 {
     synstate_T *p;
 
+    // Syntax definitions may have changed: drop the in_id_list() cache.
+    idl_cache_clear(block);
+
     if (block->b_sst_array == NULL)
        return;
 
@@ -6110,6 +6114,194 @@ copy_id_list(short *list)
     return retval;
 }
 
+/*
+ * Cache for in_id_list() {{{
+ *
+ * A "contains"/cluster list is otherwise scanned with recursive cluster
+ * expansion on every check.  Each distinct list is resolved once into a
+ * cluster-expanded, sorted set of group IDs so membership becomes a binary
+ * search.  The cache lives in the synblock and is cleared by
+ * syn_stack_free_block() when syntax definitions change (which also covers
+ * list pointer reuse).
+ */
+#define IDL_CACHE_SIZE 16
+
+typedef struct
+{
+    short      *idl_key;       // the list this entry resolves (NULL when 
empty)
+    short      *idl_ids;       // sorted, cluster-expanded group IDs 
(allocated)
+    int                idl_count;      // number of IDs in idl_ids
+    short      idl_marker;     // leading ALLBUT/TOP/CONTAINED item, or 0
+    bool       idl_usable;     // false: not cacheable, use the slow path
+} idl_entry_T;
+
+typedef struct
+{
+    idl_entry_T        ent[IDL_CACHE_SIZE];
+    int                round;          // round-robin replacement index
+    int                last;           // index of the last returned entry 
(fast path)
+} idl_cache_T;
+
+/*
+ * Collect the explicit group IDs of "list" into "ga", expanding clusters.
+ * "seen" records visited clusters to break cycles and avoid duplicate work.
+ * Return false when the list cannot be represented as a plain set (a nested
+ * ALLBUT/TOP/CONTAINED marker) or on out of memory.
+ */
+    static bool
+idl_collect(short *list, garray_T *ga, garray_T *seen)
+{
+    if (list == NULL || list == ID_LIST_ALL)
+       return false;
+    for (short item = *list; item != 0; item = *++list)
+    {
+       if (item >= SYNID_ALLBUT && item < SYNID_CLUSTER)
+           return false;       // nested ALLBUT/TOP/CONTAINED: not cacheable
+       if (item >= SYNID_CLUSTER)
+       {
+           short       *scl_list;
+           bool        seen_it = false;
+
+           for (int i = 0; i < seen->ga_len; ++i)
+               if (((short *)seen->ga_data)[i] == item)
+               {
+                   seen_it = true;
+                   break;
+               }
+           if (seen_it)
+               continue;
+           if (ga_grow(seen, 1) == FAIL)
+               return false;
+           ((short *)seen->ga_data)[seen->ga_len++] = item;
+           // Record the cluster id itself, not just its members: the slow
+           // path matches "item == id" for a cluster id present in the list.
+           if (ga_grow(ga, 1) == FAIL)
+               return false;
+           ((short *)ga->ga_data)[ga->ga_len++] = item;
+           scl_list = SYN_CLSTR(syn_block)[item - SYNID_CLUSTER].scl_list;
+           if (scl_list != NULL && !idl_collect(scl_list, ga, seen))
+               return false;
+       }
+       else
+       {
+           if (ga_grow(ga, 1) == FAIL)
+               return false;
+           ((short *)ga->ga_data)[ga->ga_len++] = item;
+       }
+    }
+    return true;
+}
+
+    static int
+idl_id_cmp(const void *a, const void *b)
+{
+    return (int)*(const short *)a - (int)*(const short *)b;
+}
+
+/*
+ * Get the cache entry that resolves "list", building it if needed.
+ * Returns NULL only on allocation failure.
+ */
+    static idl_entry_T *
+idl_get_entry(short *list)
+{
+    idl_cache_T        *cache = (idl_cache_T *)syn_block->b_idlist_cache;
+    idl_entry_T        *e;
+    garray_T   ga, seen;
+
+    if (cache == NULL)
+    {
+       cache = ALLOC_CLEAR_ONE(idl_cache_T);
+       if (cache == NULL)
+           return NULL;
+       syn_block->b_idlist_cache = cache;
+    }
+    // Fast path: the same list is usually queried many times in a row.
+    if (cache->ent[cache->last].idl_key == list)
+       return &cache->ent[cache->last];
+    for (int i = 0; i < IDL_CACHE_SIZE; ++i)
+       if (cache->ent[i].idl_key == list)
+       {
+           cache->last = i;
+           return &cache->ent[i];
+       }
+
+    // Build a new entry, replacing one in round-robin order.
+    cache->last = cache->round;
+    e = &cache->ent[cache->round];
+    cache->round = (cache->round + 1) % IDL_CACHE_SIZE;
+    VIM_CLEAR(e->idl_ids);
+    e->idl_count = 0;
+    e->idl_key = list;
+    e->idl_marker = 0;
+    e->idl_usable = true;
+
+    // A leading ALLBUT/TOP/CONTAINED marker is kept aside; the rest is the 
set.
+    if (*list >= SYNID_ALLBUT && *list < SYNID_CLUSTER)
+    {
+       e->idl_marker = *list;
+       ++list;
+    }
+
+    ga_init2(&ga, sizeof(short), 50);
+    ga_init2(&seen, sizeof(short), 10);
+    if (!idl_collect(list, &ga, &seen))
+    {
+       e->idl_usable = false;
+       ga_clear(&ga);
+       ga_clear(&seen);
+       return e;
+    }
+    ga_clear(&seen);
+    if (ga.ga_len > 0)
+    {
+       qsort(ga.ga_data, (size_t)ga.ga_len, sizeof(short), idl_id_cmp);
+       e->idl_ids = (short *)ga.ga_data;       // take ownership
+       e->idl_count = ga.ga_len;
+    }
+    else
+       ga_clear(&ga);
+    return e;
+}
+
+/*
+ * Return true if "id" is in the resolved set of cache entry "e".
+ */
+    static bool
+idl_contains(idl_entry_T *e, short id)
+{
+    int        lo = 0;
+    int        hi = e->idl_count - 1;
+
+    while (lo <= hi)
+    {
+       int     mid = (lo + hi) / 2;
+       short   v = e->idl_ids[mid];
+
+       if (v == id)
+           return true;
+       if (v < id)
+           lo = mid + 1;
+       else
+           hi = mid - 1;
+    }
+    return false;
+}
+
+    static void
+idl_cache_clear(synblock_T *block)
+{
+    idl_cache_T        *cache = (idl_cache_T *)block->b_idlist_cache;
+
+    if (cache == NULL)
+       return;
+    for (int i = 0; i < IDL_CACHE_SIZE; ++i)
+       vim_free(cache->ent[i].idl_ids);
+    vim_free(cache);
+    block->b_idlist_cache = NULL;
+}
+// }}}
+
 /*
  * Check if syntax group "ssp" is in the ID list "list" of "cur_si".
  * "cur_si" can be NULL if not checking the "containedin" list.
@@ -6131,6 +6323,7 @@ in_id_list(
     static int depth = 0;
     int                r;
     int                toplevel;
+    idl_entry_T        *e;
 
     // If ssp has a "containedin" list and "cur_si" is in it, return TRUE.
     if (cur_si != NULL && ssp->cont_in_list != NULL
@@ -6166,6 +6359,42 @@ in_id_list(
     toplevel = !(flags & HL_CONTAINED) || (flags & HL_INCLUDED_TOPLEVEL);
 
     /*
+     * Fast path: use the resolved, sorted set cached for this list.  Only the
+     * leading ALLBUT/TOP/CONTAINED gate depends on "ssp"/"flags", so it is
+     * applied here; membership itself is a binary search.  Can be turned off
+     * with test_override('syn_idlist_cache', 1) to verify the result matches
+     * the slow path.
+     */
+    e = disable_syn_idlist_cache_for_testing ? NULL : idl_get_entry(list);
+    if (e != NULL && e->idl_usable)
+    {
+       if (e->idl_marker != 0)
+       {
+           item = e->idl_marker;
+           if (item < SYNID_TOP)
+           {
+               if (item - SYNID_ALLBUT != ssp->inc_tag)
+                   return FALSE;
+           }
+           else if (item < SYNID_CONTAINED)
+           {
+               if (item - SYNID_TOP != ssp->inc_tag || !toplevel)
+                   return FALSE;
+           }
+           else
+           {
+               if (item - SYNID_CONTAINED != ssp->inc_tag || toplevel)
+                   return FALSE;
+           }
+           retval = FALSE;
+       }
+       else
+           retval = TRUE;
+       return idl_contains(e, id) ? retval : !retval;
+    }
+
+    /*
+     * Slow path (uncacheable list, e.g. a nested ALLBUT).
      * If the first item is "ALLBUT", return TRUE if "id" is NOT in the
      * contains list.  We also require that "id" is at the same ":syn include"
      * level as the list.
diff --git a/src/testdir/test_syntax.vim b/src/testdir/test_syntax.vim
index 739fee528..170e2e3b9 100644
--- a/src/testdir/test_syntax.vim
+++ b/src/testdir/test_syntax.vim
@@ -689,6 +689,43 @@ func Test_syn_zsub()
   bw!
 endfunc
 
+func s:SynDumpBuffer(ft, lines)
+  new
+  syntax on
+  execute 'setfiletype ' .. a:ft
+  call setline(1, a:lines)
+  let result = []
+  for lnum in range(1, line('$'))
+    for col in range(1, max([1, len(getline(lnum))]))
+      call add(result, synIDattr(synID(lnum, col, 1), 'name'))
+    endfor
+  endfor
+  bwipe!
+  return result
+endfunc
+
+" The in_id_list() cache is a speed optimization only: highlighting must be
+" identical with the cache on (default) and off.  Compare full-buffer synID()
+" output both ways across filetypes that use "contains"/cluster lists.
+func Test_syntax_idlist_cache_unchanged()
+  let samples = {
+        \ 'c': ['#define M 0x1F', "char c = '\n';",
+        \       'int f(int a) { return a + 0xAB; }'],
+        \ 'vim': ['func <SID>Foo() abort', '  let x = [1, 2] + {"k": 0z1f}',
+        \         'endfunc'],
+        \ 'python': ['class C:', '    def m(self): return {1: "s", 2: r"\d"}'],
+        \ 'sh': ['case $x in', '  a) echo "${y:-0xAB}";;', 'esac'],
+        \ }
+  for ft in sort(keys(samples))
+    call test_override('syn_idlist_cache', 0)
+    let l:on = s:SynDumpBuffer(ft, samples[ft])
+    call test_override('syn_idlist_cache', 1)
+    let l:off = s:SynDumpBuffer(ft, samples[ft])
+    call test_override('ALL', 0)
+    call assert_equal(l:off, l:on, 'filetype ' .. ft)
+  endfor
+endfunc
+
 " Using \z() in a region with NFA failing should not crash.
 func Test_syn_wrong_z_one()
   new
diff --git a/src/testing.c b/src/testing.c
index eaa7c69f8..e6ba03963 100644
--- a/src/testing.c
+++ b/src/testing.c
@@ -1055,6 +1055,8 @@ f_test_override(typval_T *argvars, typval_T *rettv UNUSED)
     }
     else if (STRCMP(name, (char_u *)"nfa_fail") == 0)
        nfa_fail_for_testing = val;
+    else if (STRCMP(name, (char_u *)"syn_idlist_cache") == 0)
+       disable_syn_idlist_cache_for_testing = val;
     else if (STRCMP(name, (char_u *)"no_query_mouse") == 0)
        no_query_mouse_for_testing = val;
     else if (STRCMP(name, (char_u *)"no_wait_return") == 0)
@@ -1081,6 +1083,7 @@ f_test_override(typval_T *argvars, typval_T *rettv UNUSED)
        disable_redraw_for_testing = FALSE;
        ignore_redraw_flag_for_testing = FALSE;
        nfa_fail_for_testing = FALSE;
+       disable_syn_idlist_cache_for_testing = FALSE;
        no_query_mouse_for_testing = FALSE;
        ui_delay_for_testing = 0;
        reset_term_props_on_termresponse = FALSE;
diff --git a/src/version.c b/src/version.c
index 756793446..e99db4971 100644
--- a/src/version.c
+++ b/src/version.c
@@ -759,6 +759,8 @@ static char *(features[]) =
 
 static int included_patches[] =
 {   /* Add new patch number below this line */
+/**/
+    635,
 /**/
     634,
 /**/

-- 
-- 
You received this message from the "vim_dev" maillist.
Do not top-post! Type your reply below the text you are replying to.
For more information, visit http://www.vim.org/maillist.php

--- 
You received this message because you are subscribed to the Google Groups 
"vim_dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
To view this discussion visit 
https://groups.google.com/d/msgid/vim_dev/E1wYTLt-00DQUG-Il%40256bit.org.

Raspunde prin e-mail lui