On Tue, Mar 02, 2021 at 11:59:39AM +1300, Barry Song wrote:
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 88a2e2b..d805e59 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -7797,6 +7797,16 @@ int sched_cpu_activate(unsigned int cpu)
>       if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
>               static_branch_inc_cpuslocked(&sched_smt_present);
>  #endif
> +
> +#ifdef CONFIG_SCHED_CLUSTER
> +     /*
> +      * When going up, increment the number of cluster cpus with
> +      * cluster present.
> +      */
> +     if (cpumask_weight(cpu_cluster_mask(cpu)) > 1)
> +             static_branch_inc_cpuslocked(&sched_cluster_present);
> +#endif
> +
>       set_cpu_active(cpu, true);
>  
>       if (sched_smp_initialized) {
> @@ -7873,6 +7883,14 @@ int sched_cpu_deactivate(unsigned int cpu)
>               static_branch_dec_cpuslocked(&sched_smt_present);
>  #endif
>  
> +#ifdef CONFIG_SCHED_CLUSTER
> +     /*
> +      * When going down, decrement the number of cpus with cluster present.
> +      */
> +     if (cpumask_weight(cpu_cluster_mask(cpu)) > 1)
> +             static_branch_dec_cpuslocked(&sched_cluster_present);
> +#endif
> +
>       if (!sched_smp_initialized)
>               return 0;

I don't think that's correct. IIUC this will mean the
sched_cluster_present thing will be enabled on anything with SMT (very
much including x86 big cores after the next patch).

I'm thinking that at the very least you should check that a CLS domain
actually exists, but that might be hard at this point, because the sched
domains haven't been built yet.
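
To make the failure mode concrete: on an SMT-only topology the cluster
mask degenerates to the SMT sibling mask, so
cpumask_weight(cpu_cluster_mask(cpu)) > 1 holds for every sibling pair
and the key gets flipped on regardless. A minimal sketch of one possible
guard (untested, and assuming cpu_smt_mask() is usable at this point and
that comparing the two masks is an acceptable stand-in for "a CLS domain
exists"):

#ifdef CONFIG_SCHED_CLUSTER
	/*
	 * Sketch only: treat the cluster level as real only when it
	 * spans more than the CPU's SMT siblings, so SMT-only
	 * topologies don't enable sched_cluster_present.
	 */
	if (cpumask_weight(cpu_cluster_mask(cpu)) >
	    cpumask_weight(cpu_smt_mask(cpu)))
		static_branch_inc_cpuslocked(&sched_cluster_present);
#endif

The deactivate path would need the matching condition around
static_branch_dec_cpuslocked().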

> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 8a8bd7b..3db7b07 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6009,6 +6009,11 @@ static inline int __select_idle_cpu(int cpu)
>       return -1;
>  }
>  
> +#ifdef CONFIG_SCHED_CLUSTER
> +DEFINE_STATIC_KEY_FALSE(sched_cluster_present);
> +EXPORT_SYMBOL_GPL(sched_cluster_present);

I'd really rather this not be exported.

> +#endif
> +
>  #ifdef CONFIG_SCHED_SMT
>  DEFINE_STATIC_KEY_FALSE(sched_smt_present);
>  EXPORT_SYMBOL_GPL(sched_smt_present);

This one is a KVM wart; KVM needs to know about SMT because of the
mitigation crap.
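
For reference, the consumer is the sched_smt_active() wrapper in
include/linux/sched/smt.h, which KVM (a module) calls when deciding how
to handle L1TF; that is the only reason the symbol has to be exported:

	/* include/linux/sched/smt.h */
	static __always_inline bool sched_smt_active(void)
	{
		return static_branch_likely(&sched_smt_present);
	}

Nothing modular needs the cluster equivalent, so it has no such excuse.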

> @@ -6116,6 +6121,26 @@ static inline int select_idle_core(struct task_struct *p, int core, struct cpuma
>  
>  #endif /* CONFIG_SCHED_SMT */
>  
> +static inline int _select_idle_cpu(bool smt, struct task_struct *p, int target, struct cpumask *cpus, int *idle_cpu, int *nr)
> +{
> +     int cpu, i;
> +
> +     for_each_cpu_wrap(cpu, cpus, target) {
> +             if (smt) {
> +                     i = select_idle_core(p, cpu, cpus, idle_cpu);
> +             } else {
> +                     if (!--*nr)
> +                             return -1;
> +                     i = __select_idle_cpu(cpu);
> +             } 
> +
> +             if ((unsigned int)i < nr_cpumask_bits)
> +                     return i;
> +     }
> +
> +     return -1;
> +}
> +
>  /*
>   * Scan the LLC domain for idle CPUs; this is dynamically regulated by
>   * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
> @@ -6124,7 +6149,7 @@ static inline int select_idle_core(struct task_struct *p, int core, struct cpuma
>  static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
>  {
>       struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
> -     int i, cpu, idle_cpu = -1, nr = INT_MAX;
> +     int i, idle_cpu = -1, nr = INT_MAX;
>       bool smt = test_idle_cores(target, false);
>       int this = smp_processor_id();
>       struct sched_domain *this_sd;
> @@ -6134,7 +6159,12 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
>       if (!this_sd)
>               return -1;
>  
> -     cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
> +     if (!sched_cluster_active())
> +             cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
> +#ifdef CONFIG_SCHED_CLUSTER
> +     if (sched_cluster_active())
> +             cpumask_and(cpus, cpu_cluster_mask(target), p->cpus_ptr);
> +#endif
>  
>       if (sched_feat(SIS_PROP) && !smt) {
>               u64 avg_cost, avg_idle, span_avg;
> @@ -6155,24 +6185,32 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
>               time = cpu_clock(this);
>       }
>  
> -     for_each_cpu_wrap(cpu, cpus, target) {
> -             if (smt) {
> -                     i = select_idle_core(p, cpu, cpus, &idle_cpu);
> -                     if ((unsigned int)i < nr_cpumask_bits)
> -                             return i;
> +     /* scan cluster before scanning the whole llc */
> +#ifdef CONFIG_SCHED_CLUSTER
> +     if (sched_cluster_active()) {
> +             i = _select_idle_cpu(smt, p, target, cpus, &idle_cpu, &nr);
> +             if ((unsigned int) i < nr_cpumask_bits) {
> +                     idle_cpu = i;
> +                     goto done;
> +             } else if (nr <= 0)
> +                     return -1;
>  
> -             } else {
> -                     if (!--nr)
> -                             return -1;
> -                     idle_cpu = __select_idle_cpu(cpu);
> -                     if ((unsigned int)idle_cpu < nr_cpumask_bits)
> -                             break;
> -             }
> +             cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
> +             cpumask_andnot(cpus, cpus, cpu_cluster_mask(target));
>       }
> +#endif
> +
> +     i = _select_idle_cpu(smt, p, target, cpus, &idle_cpu, &nr);
> +     if ((unsigned int) i < nr_cpumask_bits) {
> +             idle_cpu = i;
> +             goto done;
> +     } else if (nr <= 0)
> +             return -1;
>  
>       if (smt)
>               set_idle_cores(this, false);
>  
> +done:
>       if (sched_feat(SIS_PROP) && !smt) {
>               time = cpu_clock(this) - time;
>               update_avg(&this_sd->avg_scan_cost, time);

And this is just horrific :-(
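
For contrast, roughly the shape I'd expect, with the #ifdefs pushed into
header stubs so the scan path reads straight through. Untested sketch;
sched_cluster_active() and the !CONFIG_SCHED_CLUSTER cpu_cluster_mask()
fallback are assumptions about the wiring, and the SIS_PROP accounting
is elided:

/*
 * Header side (sketch): stubs so callers need no #ifdef.
 *
 *   #ifndef CONFIG_SCHED_CLUSTER
 *   static inline bool sched_cluster_active(void) { return false; }
 *   static inline const struct cpumask *cpu_cluster_mask(int cpu)
 *   { return cpumask_of(cpu); }
 *   #endif
 */
static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
{
	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
	bool smt = test_idle_cores(target, false);
	int i, idle_cpu = -1, nr = INT_MAX;

	/* SIS_PROP setup and timing elided for brevity. */

	if (sched_cluster_active()) {
		/* Scan the target's cluster first... */
		cpumask_and(cpus, cpu_cluster_mask(target), p->cpus_ptr);
		i = _select_idle_cpu(smt, p, target, cpus, &idle_cpu, &nr);
		if ((unsigned int)i < nr_cpumask_bits)
			return i;
		if (nr <= 0)
			return -1;
		/* ...then the rest of the LLC, minus the cluster. */
		cpumask_andnot(cpus, sched_domain_span(sd), cpu_cluster_mask(target));
		cpumask_and(cpus, cpus, p->cpus_ptr);
	} else {
		cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
	}

	i = _select_idle_cpu(smt, p, target, cpus, &idle_cpu, &nr);
	if ((unsigned int)i < nr_cpumask_bits)
		return i;
	if (nr <= 0)
		return -1;

	if (smt)
		set_idle_cores(smp_processor_id(), false);

	return idle_cpu;
}

The point being: one mask computation per pass, one helper, and no
#ifdef or goto in the middle of the control flow.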
