Hello Srikar,

Thanks for taking a look at the patch.

On Mon, Dec 07, 2020 at 05:40:42PM +0530, Srikar Dronamraju wrote:
> * Gautham R. Shenoy <e...@linux.vnet.ibm.com> [2020-12-04 10:18:45]:
> 
> > From: "Gautham R. Shenoy" <e...@linux.vnet.ibm.com>
> 
> <snipped>
> 
> > 
> >  static int parse_thread_groups(struct device_node *dn,
> > -                          struct thread_groups *tg,
> > -                          unsigned int property)
> > +                          struct thread_groups_list *tglp)
> >  {
> > -   int i;
> > -   u32 thread_group_array[3 + MAX_THREAD_LIST_SIZE];
> > +   int i = 0;
> > +   u32 *thread_group_array;
> >     u32 *thread_list;
> >     size_t total_threads;
> > -   int ret;
> > +   int ret = 0, count;
> > +   unsigned int property_idx = 0;
> 
> NIT:
> tglx mentions in one of his recent comments to try to keep a reverse fir tree
> ordering of variables where possible.

I suppose you mean moving the longer local variable declarations to
the top and the shorter ones to the bottom. Thanks. Will fix this.


> 
> > 
> > +   count = of_property_count_u32_elems(dn, "ibm,thread-groups");
> > +   thread_group_array = kcalloc(count, sizeof(u32), GFP_KERNEL);
> >     ret = of_property_read_u32_array(dn, "ibm,thread-groups",
> > -                                    thread_group_array, 3);
> > +                                    thread_group_array, count);
> >     if (ret)
> > -           return ret;
> > -
> > -   tg->property = thread_group_array[0];
> > -   tg->nr_groups = thread_group_array[1];
> > -   tg->threads_per_group = thread_group_array[2];
> > -   if (tg->property != property ||
> > -       tg->nr_groups < 1 ||
> > -       tg->threads_per_group < 1)
> > -           return -ENODATA;
> > +           goto out_free;
> > 
> > -   total_threads = tg->nr_groups * tg->threads_per_group;
> > +   while (i < count && property_idx < MAX_THREAD_GROUP_PROPERTIES) {
> > +           int j;
> > +           struct thread_groups *tg = &tglp->property_tgs[property_idx++];
> 
> NIT: same as above.

Ok.
> 
> > 
> > -   ret = of_property_read_u32_array(dn, "ibm,thread-groups",
> > -                                    thread_group_array,
> > -                                    3 + total_threads);
> > -   if (ret)
> > -           return ret;
> > +           tg->property = thread_group_array[i];
> > +           tg->nr_groups = thread_group_array[i + 1];
> > +           tg->threads_per_group = thread_group_array[i + 2];
> > +           total_threads = tg->nr_groups * tg->threads_per_group;
> > +
> > +           thread_list = &thread_group_array[i + 3];
> > 
> > -   thread_list = &thread_group_array[3];
> > +           for (j = 0; j < total_threads; j++)
> > +                   tg->thread_list[j] = thread_list[j];
> > +           i = i + 3 + total_threads;
> 
>       Can't we simply use memcpy instead?

We could. But this one makes it more explicit.


> 
> > +   }
> > 
> > -   for (i = 0 ; i < total_threads; i++)
> > -           tg->thread_list[i] = thread_list[i];
> > +   tglp->nr_properties = property_idx;
> > 
> > -   return 0;
> > +out_free:
> > +   kfree(thread_group_array);
> > +   return ret;
> >  }
> > 
> >  /*
> > @@ -805,24 +827,39 @@ static int get_cpu_thread_group_start(int cpu, struct 
> > thread_groups *tg)
> >     return -1;
> >  }
> > 
> > -static int init_cpu_l1_cache_map(int cpu)
> > +static int init_cpu_cache_map(int cpu, unsigned int cache_property)
> > 
> >  {
> >     struct device_node *dn = of_get_cpu_node(cpu, NULL);
> > -   struct thread_groups tg = {.property = 0,
> > -                              .nr_groups = 0,
> > -                              .threads_per_group = 0};
> > +   struct thread_groups *tg = NULL;
> >     int first_thread = cpu_first_thread_sibling(cpu);
> >     int i, cpu_group_start = -1, err = 0;
> > +   cpumask_var_t *mask;
> > +   struct thread_groups_list *cpu_tgl = &tgl[cpu];
> 
> NIT: same as 1st comment.

Sure, will fix this.

> 
> > 
> >     if (!dn)
> >             return -ENODATA;
> > 
> > -   err = parse_thread_groups(dn, &tg, THREAD_GROUP_SHARE_L1);
> > -   if (err)
> > -           goto out;
> > +   if (!(cache_property == THREAD_GROUP_SHARE_L1))
> > +           return -EINVAL;
> > 
> > -   cpu_group_start = get_cpu_thread_group_start(cpu, &tg);
> > +   if (!cpu_tgl->nr_properties) {
> > +           err = parse_thread_groups(dn, cpu_tgl);
> > +           if (err)
> > +                   goto out;
> > +   }
> > +
> > +   for (i = 0; i < cpu_tgl->nr_properties; i++) {
> > +           if (cpu_tgl->property_tgs[i].property == cache_property) {
> > +                   tg = &cpu_tgl->property_tgs[i];
> > +                   break;
> > +           }
> > +   }
> > +
> > +   if (!tg)
> > +           return -EINVAL;
> > +
> > +   cpu_group_start = get_cpu_thread_group_start(cpu, tg);
> 
> This whole hunk should be moved to a new function and called before
> init_cpu_cache_map. It will simplify the logic to great extent.

I suppose you are referring to the part where we select the correct
tg. Yeah, that can move to a different helper.

> 
> > 
> >     if (unlikely(cpu_group_start == -1)) {
> >             WARN_ON_ONCE(1);
> > @@ -830,11 +867,12 @@ static int init_cpu_l1_cache_map(int cpu)
> >             goto out;
> >     }
> > 
> > -   zalloc_cpumask_var_node(&per_cpu(cpu_l1_cache_map, cpu),
> > -                           GFP_KERNEL, cpu_to_node(cpu));
> > +   mask = &per_cpu(cpu_l1_cache_map, cpu);
> > +
> > +   zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cpu));
> > 
> 
> This hunk (and the next hunk) should be moved to next patch.
>

The next patch is only about introducing THREAD_GROUP_SHARE_L2. Hence
I put all the other code in this patch, since it seems to be the logical
place to collate whatever we have in a generic form.



> >     for (i = first_thread; i < first_thread + threads_per_core; i++) {
> > -           int i_group_start = get_cpu_thread_group_start(i, &tg);
> > +           int i_group_start = get_cpu_thread_group_start(i, tg);
> > 
> >             if (unlikely(i_group_start == -1)) {
> >                     WARN_ON_ONCE(1);
> > @@ -843,7 +881,7 @@ static int init_cpu_l1_cache_map(int cpu)
> >             }
> > 
> >             if (i_group_start == cpu_group_start)
> > -                   cpumask_set_cpu(i, per_cpu(cpu_l1_cache_map, cpu));
> > +                   cpumask_set_cpu(i, *mask);
> >     }
> > 
> >  out:
> > @@ -924,7 +962,7 @@ static int init_big_cores(void)
> >     int cpu;
> > 
> >     for_each_possible_cpu(cpu) {
> > -           int err = init_cpu_l1_cache_map(cpu);
> > +           int err = init_cpu_cache_map(cpu, THREAD_GROUP_SHARE_L1);
> > 
> >             if (err)
> >                     return err;
> > -- 
> > 1.9.4
> > 
> 
> -- 
> Thanks and Regards
> Srikar Dronamraju

Reply via email to