On Wed, Jul 08, 2020 at 06:16:35PM +0300, Alexander Shishkin wrote:
> This changes perf record to close siblings' file descriptors after they
> are created, to reduce the number of open files. The trick is setting up
> mappings and applying ioctls to these guys before they get closed, which
> means a slight rework to allow these things to happen in the same loop as
> evsel creation.
> 
> Before:
> 
> > $ perf record -e '{dummy,syscalls:*}' -o before.data uname
> > Error:
> > Too many events are opened.
> > Probably the maximum number of open file descriptors has been reached.
> > Hint: Try again after reducing the number of events.
> > Hint: Try increasing the limit with 'ulimit -n <limit>'
> 
> After:
> 
> > $ perf record -e '{dummy,syscalls:*}' -o after.data uname
> > Couldn't synthesize cgroup events.
> > Linux
> > [ perf record: Woken up 5 times to write data ]
> > [ perf record: Captured and wrote 0.118 MB after.data (62 samples) ]

hi,
really nice ;-) some questions/comments below

thanks,
jirka

> 
> Signed-off-by: Alexander Shishkin <alexander.shish...@linux.intel.com>
> ---
>  tools/include/uapi/linux/perf_event.h   |  1 +
>  tools/lib/perf/evlist.c                 | 30 +++++++++++++++-
>  tools/lib/perf/evsel.c                  | 21 +++++++++++
>  tools/lib/perf/include/internal/evsel.h |  4 +++
>  tools/perf/builtin-record.c             | 48 +++++++++++++++++++------
>  tools/perf/util/cpumap.c                |  4 +++
>  tools/perf/util/evlist.c                |  4 ++-
>  tools/perf/util/evsel.c                 | 17 ++++++++-
>  tools/perf/util/evsel.h                 |  3 ++
>  9 files changed, 119 insertions(+), 13 deletions(-)
> 
> diff --git a/tools/include/uapi/linux/perf_event.h 
> b/tools/include/uapi/linux/perf_event.h
> index 7b2d6fc9e6ed..90238b8ee2ae 100644
> --- a/tools/include/uapi/linux/perf_event.h
> +++ b/tools/include/uapi/linux/perf_event.h
> @@ -1069,6 +1069,7 @@ enum perf_callchain_context {
>  #define PERF_FLAG_FD_OUTPUT          (1UL << 1)
>  #define PERF_FLAG_PID_CGROUP         (1UL << 2) /* pid=cgroup id, per-cpu 
> mode only */
>  #define PERF_FLAG_FD_CLOEXEC         (1UL << 3) /* O_CLOEXEC */
> +#define PERF_FLAG_ALLOW_CLOSE                (1UL << 4) /* XXX */
>  
>  #if defined(__LITTLE_ENDIAN_BITFIELD)
>  union perf_mem_data_src {
> diff --git a/tools/lib/perf/evlist.c b/tools/lib/perf/evlist.c
> index 6a875a0f01bb..8aeb5634bc61 100644
> --- a/tools/lib/perf/evlist.c
> +++ b/tools/lib/perf/evlist.c
> @@ -403,6 +403,7 @@ perf_evlist__mmap_cb_get(struct perf_evlist *evlist, bool 
> overwrite, int idx)
>  }
>  
>  #define FD(e, x, y) (*(int *) xyarray__entry(e->fd, x, y))
> +#define OUTPUT(e, x, y) (*(int *)xyarray__entry(e->output, x, y))
>  
>  static int
>  perf_evlist__mmap_cb_mmap(struct perf_mmap *map, struct perf_mmap_param *mp,
> @@ -433,6 +434,11 @@ mmap_per_evsel(struct perf_evlist *evlist, struct 
> perf_evlist_mmap_ops *ops,
>               bool overwrite = evsel->attr.write_backward;
>               struct perf_mmap *map;
>               int *output, fd, cpu;
> +             bool do_close = false;
> +
> +             /* Skip over not yet opened evsels */
> +             if (!evsel->fd)
> +                     continue;
>  
>               if (evsel->system_wide && thread)
>                       continue;
> @@ -454,6 +460,10 @@ mmap_per_evsel(struct perf_evlist *evlist, struct 
> perf_evlist_mmap_ops *ops,
>               }
>  
>               fd = FD(evsel, cpu, thread);
> +             if (OUTPUT(evsel, cpu, thread) >= 0) {
> +                     *output = OUTPUT(evsel, cpu, thread);
> +                     continue;
> +             }

this check could be earlier in the loop to save some cycles

>  
>               if (*output == -1) {
>                       *output = fd;
> @@ -479,6 +489,10 @@ mmap_per_evsel(struct perf_evlist *evlist, struct 
> perf_evlist_mmap_ops *ops,
>                       if (!idx)
>                               perf_evlist__set_mmap_first(evlist, map, 
> overwrite);
>               } else {
> +                     if (fd < 0) {
> +                             fd = -fd;
> +                             do_close = true;
> +                     }
>                       if (ioctl(fd, PERF_EVENT_IOC_SET_OUTPUT, *output) != 0)
>                               return -1;
>  
> @@ -487,7 +501,7 @@ mmap_per_evsel(struct perf_evlist *evlist, struct 
> perf_evlist_mmap_ops *ops,
>  
>               revent = !overwrite ? POLLIN : 0;
>  
> -             if (!evsel->system_wide &&
> +             if (!evsel->system_wide && !do_close &&
>                   perf_evlist__add_pollfd(evlist, fd, map, revent) < 0) {
>                       perf_mmap__put(map);
>                       return -1;
> @@ -500,6 +514,13 @@ mmap_per_evsel(struct perf_evlist *evlist, struct 
> perf_evlist_mmap_ops *ops,
>                       perf_evlist__set_sid_idx(evlist, evsel, idx, cpu,
>                                                thread);
>               }
> +
> +             if (do_close) {
> +                     close(fd);
> +                     perf_mmap__put(map);
> +                     FD(evsel, cpu, thread) = -1; /* *output */
> +             }

it's also possible to define new callback in perf_evlist_mmap_ops,
and do the close there.. not sure it'd be nicer


> +             OUTPUT(evsel, cpu, thread) = *output;
>       }
>  
>       return 0;
> @@ -587,10 +608,17 @@ int perf_evlist__mmap_ops(struct perf_evlist *evlist,
>       evlist->nr_mmaps = perf_evlist__nr_mmaps(evlist);
>  
>       perf_evlist__for_each_entry(evlist, evsel) {
> +             /* Skip over not yet opened evsels */
> +             if (!evsel->fd)
> +                     continue;
> +
>               if ((evsel->attr.read_format & PERF_FORMAT_ID) &&
>                   evsel->sample_id == NULL &&
>                   perf_evsel__alloc_id(evsel, perf_cpu_map__nr(cpus), 
> threads->nr) < 0)
>                       return -ENOMEM;
> +             if (evsel->output == NULL &&
> +                 perf_evsel__alloc_output(evsel, perf_cpu_map__nr(cpus), 
> threads->nr) < 0)
> +                     return -ENOMEM;
>       }
>  
>       if (evlist->pollfd.entries == NULL && perf_evlist__alloc_pollfd(evlist) 
> < 0)
> diff --git a/tools/lib/perf/evsel.c b/tools/lib/perf/evsel.c
> index 4dc06289f4c7..6661145ca673 100644
> --- a/tools/lib/perf/evsel.c
> +++ b/tools/lib/perf/evsel.c
> @@ -55,6 +55,21 @@ int perf_evsel__alloc_fd(struct perf_evsel *evsel, int 
> ncpus, int nthreads)
>       return evsel->fd != NULL ? 0 : -ENOMEM;
>  }
>  
> +int perf_evsel__alloc_output(struct perf_evsel *evsel, int ncpus, int 
> nthreads)
> +{
> +     evsel->output = xyarray__new(ncpus, nthreads, sizeof(int));
> +
> +     if (evsel->output) {
> +             int cpu, thread;
> +             for (cpu = 0; cpu < ncpus; cpu++) {
> +                     for (thread = 0; thread < nthreads; thread++)
> +                             *(int *)xyarray__entry(evsel->output, cpu, 
> thread) = -1;
> +             }
> +     }
> +
> +     return evsel->output != NULL ? 0 : -ENOMEM;
> +}
> +
>  static int
>  sys_perf_event_open(struct perf_event_attr *attr,
>                   pid_t pid, int cpu, int group_fd,
> @@ -139,6 +154,12 @@ void perf_evsel__free_fd(struct perf_evsel *evsel)
>       evsel->fd = NULL;
>  }
>  
> +void perf_evsel__free_output(struct perf_evsel *evsel)
> +{
> +     xyarray__delete(evsel->output);
> +     evsel->output = NULL;
> +}
> +
>  void perf_evsel__close(struct perf_evsel *evsel)
>  {
>       if (evsel->fd == NULL)
> diff --git a/tools/lib/perf/include/internal/evsel.h 
> b/tools/lib/perf/include/internal/evsel.h
> index 1ffd083b235e..9bbbec7d92b1 100644
> --- a/tools/lib/perf/include/internal/evsel.h
> +++ b/tools/lib/perf/include/internal/evsel.h
> @@ -42,6 +42,7 @@ struct perf_evsel {
>       struct perf_thread_map  *threads;
>       struct xyarray          *fd;
>       struct xyarray          *sample_id;
> +     struct xyarray          *output;
>       u64                     *id;
>       u32                      ids;
>  
> @@ -60,4 +61,7 @@ int perf_evsel__apply_filter(struct perf_evsel *evsel, 
> const char *filter);
>  int perf_evsel__alloc_id(struct perf_evsel *evsel, int ncpus, int nthreads);
>  void perf_evsel__free_id(struct perf_evsel *evsel);
>  
> +int perf_evsel__alloc_output(struct perf_evsel *evsel, int ncpus, int 
> nthreads);
> +void perf_evsel__free_output(struct perf_evsel *evsel);

introducing these helpers together with the re-entrancy check
should go to separate patch

> +
>  #endif /* __LIBPERF_INTERNAL_EVSEL_H */
> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
> index e108d90ae2ed..7dd7db4ff37a 100644
> --- a/tools/perf/builtin-record.c
> +++ b/tools/perf/builtin-record.c
> @@ -837,6 +837,21 @@ static int record__mmap(struct record *rec)
>       return record__mmap_evlist(rec, rec->evlist);
>  }
>  
> +static int record__apply_filters(struct record *rec)
> +{
> +     struct evsel *pos;
> +     char msg[BUFSIZ];
> +
> +     if (perf_evlist__apply_filters(rec->evlist, &pos)) {
> +             pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
> +                    pos->filter, evsel__name(pos), errno,
> +                    str_error_r(errno, msg, sizeof(msg)));
> +             return -1;
> +     }
> +
> +     return 0;
> +}
> +
>  static int record__open(struct record *rec)
>  {
>       char msg[BUFSIZ];
> @@ -844,6 +859,7 @@ static int record__open(struct record *rec)
>       struct evlist *evlist = rec->evlist;
>       struct perf_session *session = rec->session;
>       struct record_opts *opts = &rec->opts;
> +     bool late_mmap = true;
>       int rc = 0;
>  
>       /*
> @@ -894,6 +910,21 @@ static int record__open(struct record *rec)
>               }
>  
>               pos->supported = true;
> +
> +             if (pos->allow_close) {
> +                     rc = record__mmap(rec);
> +                     if (rc)
> +                             goto out;
> +                     rc = record__apply_filters(rec);
> +                     if (rc)
> +                             goto out;
> +                     late_mmap = false;
> +             }

I was wondering why to call record__mmap for each evsel,
and why there are all those new re-entrancy checks..

because we want to close those file descriptors along the way,
not after all is open.. it could have been mentioned in some
comment ;-)

with all those re-entrancy checks could we call it just from
here for both allow_close and not allow_close cases and skip
the call below?

> +     }
> +
> +     evlist__for_each_entry(evlist, pos) {
> +             if (pos->core.output)
> +                     perf_evsel__free_output(&pos->core);
>       }
>  
>       if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(evlist)) {
> @@ -907,18 +938,15 @@ static int record__open(struct record *rec)
>  "even with a suitable vmlinux or kallsyms file.\n\n");
>       }
>  
> -     if (perf_evlist__apply_filters(evlist, &pos)) {
> -             pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
> -                     pos->filter, evsel__name(pos), errno,
> -                     str_error_r(errno, msg, sizeof(msg)));
> -             rc = -1;
> -             goto out;
> +     if (late_mmap) {
> +             rc = record__mmap(rec);
> +             if (rc)
> +                     goto out;
> +             rc = record__apply_filters(rec);
> +             if (rc)
> +                     goto out;
>       }
>  
> -     rc = record__mmap(rec);
> -     if (rc)
> -             goto out;
> -
>       session->evlist = evlist;
>       perf_session__set_id_hdr_size(session);
>  out:
> diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
> index dc5c5e6fc502..2eac58ada5ab 100644
> --- a/tools/perf/util/cpumap.c
> +++ b/tools/perf/util/cpumap.c
> @@ -432,6 +432,10 @@ int cpu__setup_cpunode_map(void)
>       const char *mnt;
>       int n;
>  
> +     /* Only do this once */
> +     if (cpunode_map)
> +             return 0;
> +

this should go to separate patch

>       /* initialize globals */
>       if (init_cpunode_map())
>               return -1;
> diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
> index 173b4f0e0e6e..776a8339df99 100644
> --- a/tools/perf/util/evlist.c
> +++ b/tools/perf/util/evlist.c
> @@ -977,7 +977,7 @@ int perf_evlist__apply_filters(struct evlist *evlist, 
> struct evsel **err_evsel)
>       int err = 0;
>  
>       evlist__for_each_entry(evlist, evsel) {
> -             if (evsel->filter == NULL)
> +             if (evsel->filter == NULL || evsel->filter_applied)
>                       continue;
>  
>               /*
> @@ -989,6 +989,8 @@ int perf_evlist__apply_filters(struct evlist *evlist, 
> struct evsel **err_evsel)
>                       *err_evsel = evsel;
>                       break;
>               }
> +
> +             evsel->filter_applied = true;
>       }
>  
>       return err;
> diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
> index 96e5171dce41..696c5d7190e2 100644
> --- a/tools/perf/util/evsel.c
> +++ b/tools/perf/util/evsel.c
> @@ -1172,6 +1172,7 @@ int evsel__set_filter(struct evsel *evsel, const char 
> *filter)
>       if (new_filter != NULL) {
>               free(evsel->filter);
>               evsel->filter = new_filter;
> +             evsel->filter_applied = false;
>               return 0;
>       }
>  
> @@ -1632,6 +1633,9 @@ static int evsel__open_cpu(struct evsel *evsel, struct 
> perf_cpu_map *cpus,
>               pid = evsel->cgrp->fd;
>       }
>  
> +     if (evsel->leader != evsel)
> +             flags |= PERF_FLAG_ALLOW_CLOSE;
> +
>  fallback_missing_features:
>       if (perf_missing_features.clockid_wrong)
>               evsel->core.attr.clockid = CLOCK_MONOTONIC; /* should always 
> work */
> @@ -1641,6 +1645,8 @@ static int evsel__open_cpu(struct evsel *evsel, struct 
> perf_cpu_map *cpus,
>       }
>       if (perf_missing_features.cloexec)
>               flags &= ~(unsigned long)PERF_FLAG_FD_CLOEXEC;
> +     if (perf_missing_features.allow_close)
> +             flags &= ~(unsigned long)PERF_FLAG_ALLOW_CLOSE;
>       if (perf_missing_features.mmap2)
>               evsel->core.attr.mmap2 = 0;
>       if (perf_missing_features.exclude_guest)
> @@ -1729,6 +1735,11 @@ static int evsel__open_cpu(struct evsel *evsel, struct 
> perf_cpu_map *cpus,
>                               err = -EINVAL;
>                               goto out_close;
>                       }
> +
> +                     if (flags & PERF_FLAG_ALLOW_CLOSE) {
> +                             FD(evsel, cpu, thread) = -fd;
> +                             evsel->allow_close = true;

can't we move allow_close to perf_evsel and use that directly
instead of that -fd thing?

> +                     }
>               }
>       }
>  
> @@ -1766,7 +1777,11 @@ static int evsel__open_cpu(struct evsel *evsel, struct 
> perf_cpu_map *cpus,
>        * Must probe features in the order they were added to the
>        * perf_event_attr interface.
>        */
> -        if (!perf_missing_features.cgroup && evsel->core.attr.cgroup) {
> +     if (!perf_missing_features.allow_close && (flags & 
> PERF_FLAG_ALLOW_CLOSE)) {
> +             perf_missing_features.allow_close = true;
> +             pr_debug2("switching off ALLOW_CLOSE support\n");
> +             goto fallback_missing_features;
> +     } else if (!perf_missing_features.cgroup && evsel->core.attr.cgroup) {
>               perf_missing_features.cgroup = true;
>               pr_debug2_peo("Kernel has no cgroup sampling support, bailing 
> out\n");
>               goto out_close;
> diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
> index 0f963c2a88a5..076a541bb274 100644
> --- a/tools/perf/util/evsel.h
> +++ b/tools/perf/util/evsel.h
> @@ -77,6 +77,8 @@ struct evsel {
>       bool                    forced_leader;
>       bool                    use_uncore_alias;
>       bool                    is_libpfm_event;
> +     bool                    filter_applied;

should go to separate patch

> +     bool                    allow_close;
>       /* parse modifier helper */
>       int                     exclude_GH;
>       int                     sample_read;
> @@ -130,6 +132,7 @@ struct perf_missing_features {
>       bool aux_output;
>       bool branch_hw_idx;
>       bool cgroup;
> +     bool allow_close;
>  };
>  
>  extern struct perf_missing_features perf_missing_features;
> -- 
> 2.27.0
> 

Reply via email to