Add support in the ring buffer for allocating pages from the persistent
RAM buffer. Switching to persistent memory and back again is supported.
A new trace option, 'persist', has been added; once it is enabled, the
pages in the ring buffer are freed and new pages are allocated from
persistent memory.

Signed-off-by: Nachammai Karuppiah <nachukan...@gmail.com>
---
 kernel/trace/Kconfig       |  10 ++
 kernel/trace/ring_buffer.c | 257 ++++++++++++++++++++++++++++++++++++++++++++-
 kernel/trace/trace.c       |  12 ++-
 kernel/trace/trace.h       |   3 +-
 4 files changed, 279 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index a4020c0..f72a9df 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -739,6 +739,16 @@ config GCOV_PROFILE_FTRACE
          Note that on a kernel compiled with this config, ftrace will
          run significantly slower.
 
+config TRACE_EVENTS_TO_PSTORE
+       bool "Enable users to store trace records in persistent storage"
+       help
+         This option enables users to store trace records in a
+         persistent RAM buffer so that they can be retrieved after
+         system reboot.
+
+         If unsure, say N.
+
 config FTRACE_SELFTEST
        bool
 
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index f15471c..60b587a 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -25,7 +25,7 @@
 #include <linux/list.h>
 #include <linux/cpu.h>
 #include <linux/oom.h>
-
+#include <linux/ramtrace.h>
 #include <asm/local.h>
 
 static void update_pages_handler(struct work_struct *work);
@@ -479,6 +479,9 @@ struct ring_buffer_per_cpu {
        struct completion               update_done;
 
        struct rb_irq_work              irq_work;
+#ifdef CONFIG_TRACE_EVENTS_TO_PSTORE
+       bool                            use_pstore;
+#endif
 };
 
 struct trace_buffer {
@@ -513,6 +516,15 @@ struct ring_buffer_iter {
        int                             missed_events;
 };
 
+#ifdef CONFIG_TRACE_EVENTS_TO_PSTORE
+/*
+ * Guards against switching buffer_data_page memory between persistent
+ * storage and regular pages while a reader page is swapped out. It is
+ * taken for read in ring_buffer_alloc_read_page() and released in
+ * ring_buffer_free_read_page(); ring_buffer_switch_memory() must take it
+ * for write, so all consuming reads have to finish before a memory
+ * switch happens.
+ */
+DECLARE_RWSEM(trace_read_sem);
+#endif
+
 /**
  * ring_buffer_nr_pages - get the number of buffer pages in the ring buffer
  * @buffer: The ring_buffer to get the number of pages from
@@ -1705,6 +1717,247 @@ static void update_pages_handler(struct work_struct 
*work)
        complete(&cpu_buffer->update_done);
 }
 
+#ifdef CONFIG_TRACE_EVENTS_TO_PSTORE
+/*
+ * Release one data page: @persist selects ramtrace_free_page() (which also
+ * receives @cpu) over the regular free_page().
+ */
+static void free_buffer_data_page(struct buffer_data_page *page, int cpu,
+                                 bool persist)
+{
+       if (!persist) {
+               free_page((unsigned long)page);
+               return;
+       }
+
+       ramtrace_free_page(page, cpu);
+}
+
+/*
+ * rb_allocate_persistent_pages - fill @pages via ramtrace_alloc_page()
+ * @pages: array with room for @nr_pages entries
+ * @nr_pages: number of pages to allocate
+ * @cpu: CPU handed to ramtrace_alloc_page()/ramtrace_free_page()
+ *
+ * Returns 0 when every slot was filled. On failure returns -ENOMEM after
+ * giving back the pages obtained so far; those slots are reset to NULL and
+ * the slots that were never filled are left untouched.
+ */
+static int rb_allocate_persistent_pages(struct buffer_data_page **pages,
+                                       int nr_pages, int cpu)
+{
+       int i;
+
+       for (i = 0; i < nr_pages; i++) {
+               void *address = ramtrace_alloc_page(cpu);
+
+               if (!address)
+                       goto free_pages;
+               pages[i] = address;
+       }
+       return 0;
+
+free_pages:
+       /*
+        * Only slots [0, i) hold pages. The caller's array is not
+        * guaranteed to be zeroed, so the remaining slots must not be
+        * passed to ramtrace_free_page().
+        */
+       while (i-- > 0) {
+               ramtrace_free_page(pages[i], cpu);
+               pages[i] = NULL;
+       }
+
+       return -ENOMEM;
+}
+
+/*
+ * rb_allocate_buffer_data_pages - fill @pages from the page allocator
+ * @pages: array with room for @nr_pages entries
+ * @nr_pages: number of pages to allocate
+ * @cpu: CPU whose memory node the pages are allocated on
+ *
+ * Every page is initialized with rb_init_page(). Returns 0 when every slot
+ * was filled. On failure (not enough memory, or a fatal signal pending for
+ * a user thread) returns -ENOMEM after freeing the pages obtained so far;
+ * those slots are reset to NULL and unfilled slots are left untouched.
+ */
+static int
+rb_allocate_buffer_data_pages(struct buffer_data_page **pages, int nr_pages,
+                             int cpu)
+{
+       bool user_thread = current->mm != NULL;
+       gfp_t mflags;
+       long i;
+
+       /*
+        * Check if the available memory is there first.
+        * Note, si_mem_available() only gives us a rough estimate of available
+        * memory. It may not be accurate. But we don't care, we just want
+        * to prevent doing any allocation when it is obvious that it is
+        * not going to succeed.
+        */
+       i = si_mem_available();
+       if (i < nr_pages)
+               return -ENOMEM;
+
+       /*
+        * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails
+        * gracefully without invoking oom-killer and the system is not
+        * destabilized.
+        */
+       mflags = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
+
+       /*
+        * If a user thread allocates too much, and si_mem_available()
+        * reports there's enough memory, even though there is not.
+        * Make sure the OOM killer kills this thread. This can happen
+        * even with RETRY_MAYFAIL because another task may be doing
+        * an allocation after this task has taken all memory.
+        * This is the task the OOM killer needs to take out during this
+        * loop, even if it was triggered by an allocation somewhere else.
+        */
+       if (user_thread)
+               set_current_oom_origin();
+       for (i = 0; i < nr_pages; i++) {
+               struct page *page;
+
+               page = alloc_pages_node(cpu_to_node(cpu), mflags, 0);
+               if (!page)
+                       goto free_pages;
+               pages[i] = page_address(page);
+               rb_init_page(pages[i]);
+
+               if (user_thread && fatal_signal_pending(current)) {
+                       /* pages[i] is already allocated; free it as well. */
+                       i++;
+                       goto free_pages;
+               }
+       }
+
+       if (user_thread)
+               clear_current_oom_origin();
+       return 0;
+
+free_pages:
+       /*
+        * Only slots [0, i) hold pages. The caller's array is not
+        * guaranteed to be zeroed, so the remaining slots must not be
+        * passed to free_page().
+        */
+       while (i-- > 0) {
+               free_page((unsigned long)pages[i]);
+               pages[i] = NULL;
+       }
+
+       /* Do not leave the task marked as OOM origin after a failure. */
+       if (user_thread)
+               clear_current_oom_origin();
+
+       return -ENOMEM;
+}
+
+/*
+ * rb_switch_memory - replace the data pages of every per-CPU buffer
+ * @buffer: ring buffer whose data pages are replaced
+ * @persist: true to allocate the new pages with
+ *          rb_allocate_persistent_pages(), false to allocate them with
+ *          rb_allocate_buffer_data_pages()
+ *
+ * All replacement pages (nr_pages plus one for the reader page, per CPU)
+ * are allocated before any buffer is modified, so an allocation failure
+ * leaves the ring buffer as it was. Each per-CPU buffer is then switched
+ * and reset under its reader_lock and arch lock; the old pages are freed
+ * according to the buffer's current use_pstore value, which is then set
+ * to @persist.
+ *
+ * Nothing is returned. On failure the replacement pages that were not
+ * installed are freed and the CPU buffers not yet switched keep their
+ * old pages.
+ */
+static void rb_switch_memory(struct trace_buffer *buffer, bool persist)
+{
+       struct ring_buffer_per_cpu *cpu_buffer;
+       struct buffer_data_page ***new_pages;
+       struct buffer_page *bpage;
+       struct list_head *head;
+       unsigned long flags;
+       int cpu, nr_pages, ret;
+
+       /* Zeroed so that the cleanup path can tell which CPUs hold pages. */
+       new_pages = kcalloc(buffer->cpus, sizeof(*new_pages), GFP_KERNEL);
+       if (!new_pages)
+               return;
+
+       for_each_buffer_cpu(buffer, cpu) {
+               cpu_buffer = buffer->buffers[cpu];
+               /* Include the reader page */
+               nr_pages = cpu_buffer->nr_pages + 1;
+
+               new_pages[cpu] = kcalloc(nr_pages, sizeof(**new_pages),
+                                        GFP_KERNEL);
+               if (!new_pages[cpu])
+                       goto out;
+
+               if (persist)
+                       ret = rb_allocate_persistent_pages(new_pages[cpu],
+                                                          nr_pages, cpu);
+               else
+                       ret = rb_allocate_buffer_data_pages(new_pages[cpu],
+                                                           nr_pages, cpu);
+               if (ret < 0) {
+                       /*
+                        * The allocator frees its pages before failing;
+                        * drop the array so that the cleanup below does
+                        * not free them a second time.
+                        */
+                       kfree(new_pages[cpu]);
+                       new_pages[cpu] = NULL;
+                       goto out;
+               }
+       }
+
+       for_each_buffer_cpu(buffer, cpu) {
+               int i = 0;
+
+               cpu_buffer = buffer->buffers[cpu];
+               nr_pages = cpu_buffer->nr_pages;
+               /* Acquire the reader lock to ensure reading is disabled.*/
+               raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
+               if (RB_WARN_ON(cpu_buffer,
+                              local_read(&cpu_buffer->committing))) {
+                       /* Never leave with the reader lock held. */
+                       raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock,
+                                                  flags);
+                       goto out;
+               }
+               /* Prevent another thread from grabbing free_page. */
+               arch_spin_lock(&cpu_buffer->lock);
+
+               /* Slot 0 replaces the reader page. */
+               free_buffer_data_page(cpu_buffer->reader_page->page,
+                                     cpu, cpu_buffer->use_pstore);
+               cpu_buffer->reader_page->page = new_pages[cpu][i++];
+               rb_head_page_deactivate(cpu_buffer);
+
+               head = cpu_buffer->pages;
+               if (head) {
+                       /* Every page on the list except the one at @head. */
+                       list_for_each_entry(bpage, head, list) {
+                               free_buffer_data_page(bpage->page, cpu,
+                                                     cpu_buffer->use_pstore);
+                               bpage->page = new_pages[cpu][i++];
+                               rb_init_page(bpage->page);
+                       }
+                       /* The page at @head takes the last slot. */
+                       bpage = list_entry(head, struct buffer_page, list);
+                       free_buffer_data_page(bpage->page, cpu,
+                                             cpu_buffer->use_pstore);
+                       bpage->page = new_pages[cpu][nr_pages];
+                       rb_init_page(bpage->page);
+               }
+
+               if (cpu_buffer->free_page) {
+                       free_buffer_data_page(cpu_buffer->free_page, cpu,
+                                             cpu_buffer->use_pstore);
+                       cpu_buffer->free_page = NULL;
+               }
+
+               cpu_buffer->use_pstore = persist;
+
+               rb_reset_cpu(cpu_buffer);
+               arch_spin_unlock(&cpu_buffer->lock);
+               raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+               /*
+                * The pages now belong to the ring buffer. Clear the
+                * pointer so that a later failure does not free them.
+                */
+               kfree(new_pages[cpu]);
+               new_pages[cpu] = NULL;
+       }
+
+       kfree(new_pages);
+       return;
+out:
+       /* Free the replacement pages of the CPUs that were not switched. */
+       for_each_buffer_cpu(buffer, cpu) {
+               int i;
+
+               if (!new_pages[cpu])
+                       continue;
+
+               cpu_buffer = buffer->buffers[cpu];
+               for (i = 0; i < cpu_buffer->nr_pages + 1; i++) {
+                       if (new_pages[cpu][i])
+                               free_buffer_data_page(new_pages[cpu][i], cpu,
+                                                     persist);
+               }
+               kfree(new_pages[cpu]);
+       }
+       kfree(new_pages);
+}
+
+void pstore_tracing_off(void);
+
+/**
+ * ring_buffer_switch_memory - switch the memory backing the ring buffer
+ * @buffer: the ring buffer whose data pages are switched
+ * @tracer_name: tracer name passed to init_ramtrace_pages()
+ * @clock_id: clock id passed to init_ramtrace_pages()
+ * @persist: true to switch to persistent memory, false to switch to
+ *          non persistent memory
+ *
+ * Recording is disabled while the pages are switched and enabled again
+ * afterwards.
+ *
+ * Returns -EBUSY if trace_read_sem cannot be taken for write, -ENOMEM if
+ * @persist is true and is_ramtrace_available() or init_ramtrace_pages()
+ * reports a failure, and 0 otherwise. rb_switch_memory() does not report
+ * errors, so 0 does not guarantee that every page was switched.
+ */
+int
+ring_buffer_switch_memory(struct trace_buffer *buffer, const char *tracer_name,
+                         int clock_id, bool persist)
+{
+       int cpu;
+       int online_cpu = 0;
+       int nr_pages_total = 0;
+       int ret = 0;
+
+       if (RB_WARN_ON(buffer, !down_write_trylock(&trace_read_sem)))
+               return -EBUSY;
+
+       if (persist) {
+               /* Quit if there is no reserved ramtrace region available */
+               if (!is_ramtrace_available()) {
+                       ret = -ENOMEM;
+                       goto out_unlock;
+               }
+
+               /*
+                * Disable pstore_trace buffers which are used for reading
+                * previous boot data pages.
+                */
+               pstore_tracing_off();
+
+               /* Estimate the number of pages needed. */
+               for_each_buffer_cpu(buffer, cpu) {
+                       online_cpu++;
+                       /* count the reader page as well */
+                       nr_pages_total += buffer->buffers[cpu]->nr_pages + 1;
+               }
+               /* Initialize ramtrace pages */
+               if (init_ramtrace_pages(online_cpu, nr_pages_total,
+                                       tracer_name, clock_id)) {
+                       ret = -ENOMEM;
+                       goto out_unlock;
+               }
+       }
+
+       ring_buffer_record_disable(buffer);
+
+       /* Make sure all pending commits have finished */
+       synchronize_rcu();
+
+       /* prevent another thread from changing buffer sizes */
+       mutex_lock(&buffer->mutex);
+
+       rb_switch_memory(buffer, persist);
+
+       mutex_unlock(&buffer->mutex);
+
+       ring_buffer_record_enable(buffer);
+
+out_unlock:
+       /* Every exit after the trylock must drop the write lock. */
+       up_write(&trace_read_sem);
+       return ret;
+}
+#endif
+
 /**
  * ring_buffer_resize - resize the ring buffer
  * @buffer: the buffer to resize.
@@ -4716,6 +4969,7 @@ void *ring_buffer_alloc_read_page(struct trace_buffer 
*buffer, int cpu)
 
  out:
        rb_init_page(bpage);
+       down_read(&trace_read_sem);
 
        return bpage;
 }
@@ -4753,6 +5007,7 @@ void ring_buffer_free_read_page(struct trace_buffer 
*buffer, int cpu, void *data
 
  out:
        free_page((unsigned long)bpage);
+       up_read(&trace_read_sem);
 }
 EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index bb62269..2b3d8e9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -48,6 +48,7 @@
 #include <linux/fsnotify.h>
 #include <linux/irq_work.h>
 #include <linux/workqueue.h>
+#include <linux/ramtrace.h>
 
 #include "trace.h"
 #include "trace_output.h"
@@ -265,7 +266,8 @@ unsigned long long ns2usecs(u64 nsec)
 
 /* trace_flags that are default zero for instances */
 #define ZEROED_TRACE_FLAGS \
-       (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK)
+       (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK |                 \
+        TRACE_ITER_PERSIST)
 
 /*
  * The global_trace is the descriptor that holds the top-level tracing
@@ -4851,6 +4853,14 @@ int set_tracer_flag(struct trace_array *tr, unsigned int 
mask, int enabled)
                trace_printk_control(enabled);
        }
 
+#ifdef CONFIG_TRACE_EVENTS_TO_PSTORE
+       if (mask == TRACE_ITER_PERSIST) {
+               ring_buffer_switch_memory(tr->array_buffer.buffer,
+                                         tr->current_trace->name,
+                                         tr->clock_id, enabled);
+       }
+#endif
+
        return 0;
 }
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 13db400..2a4ab72 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1336,7 +1336,8 @@ extern int trace_get_user(struct trace_parser *parser, 
const char __user *ubuf,
                FUNCTION_FLAGS                                  \
                FGRAPH_FLAGS                                    \
                STACK_FLAGS                                     \
-               BRANCH_FLAGS
+               BRANCH_FLAGS                                    \
+               C(PERSIST,              "persist"),
 
 /*
  * By defining C, we can make TRACE_FLAGS a list of bit names
-- 
2.7.4

Reply via email to