[PATCH v3] libgfortran: Replace mutex with rwlock

2023-04-19 Thread Lipeng Zhu via Fortran
This patch try to introduce the rwlock and split the read/write to
unit_root tree and unit_cache with rwlock instead of the mutex to
increase CPU efficiency. In the get_gfc_unit function, the percentage
to step into the insert_unit function is around 30%, in most instances,
we can get the unit in the phase of reading the unit_cache or unit_root
tree. So split the read/write phase by rwlock would be an approach to
make it more parallel.

BTW, the IPC metrics can gain around 9x in our test
server with 220 cores. The benchmark we used is
https://github.com/rwesson/NEAT

libgcc/ChangeLog:

* gthr-posix.h (__GTHREAD_RWLOCK_INIT): New macro
(__gthrw): New function
(__gthread_rwlock_rdlock): New function
(__gthread_rwlock_tryrdlock): New function
(__gthread_rwlock_wrlock): New function
(__gthread_rwlock_trywrlock): New function
(__gthread_rwlock_unlock): New function

libgfortran/ChangeLog:

* io/async.c (DEBUG_LINE): New
* io/async.h (RWLOCK_DEBUG_ADD): New macro
(CHECK_RDLOCK): New macro
(CHECK_WRLOCK): New macro
(TAIL_RWLOCK_DEBUG_QUEUE): New macro
(IN_RWLOCK_DEBUG_QUEUE): New macro
(RDLOCK): New macro
(WRLOCK): New macro
(RWUNLOCK): New macro
(RD_TO_WRLOCK): New macro
(INTERN_RDLOCK): New macro
(INTERN_WRLOCK): New macro
(INTERN_RWUNLOCK): New macro
* io/io.h (internal_proto): Define unit_rwlock
* io/transfer.c (st_read_done_worker): Relace unit_lock with unit_rwlock
(st_write_done_worker): Relace unit_lock with unit_rwlock
* io/unit.c (get_gfc_unit): Relace unit_lock with unit_rwlock
(if): Relace unit_lock with unit_rwlock
(close_unit_1): Relace unit_lock with unit_rwlock
(close_units): Relace unit_lock with unit_rwlock
(newunit_alloc): Relace unit_lock with unit_rwlock
* io/unix.c (flush_all_units): Relace unit_lock with unit_rwlock

---
v1 -> v2:
Limit the pthread_rwlock usage in libgcc only when __cplusplus isn't defined.

v2 -> v3:
Rebase the patch with trunk branch.

Signed-off-by: Lipeng Zhu 
---
 libgcc/gthr-posix.h   |  60 +++
 libgfortran/io/async.c|   4 +
 libgfortran/io/async.h| 151 ++
 libgfortran/io/io.h   |  15 ++--
 libgfortran/io/transfer.c |   8 +-
 libgfortran/io/unit.c |  65 
 libgfortran/io/unix.c |  16 ++--
 7 files changed, 273 insertions(+), 46 deletions(-)

diff --git a/libgcc/gthr-posix.h b/libgcc/gthr-posix.h
index aebcfdd9f4c..73283082997 100644
--- a/libgcc/gthr-posix.h
+++ b/libgcc/gthr-posix.h
@@ -48,6 +48,9 @@ typedef pthread_t __gthread_t;
 typedef pthread_key_t __gthread_key_t;
 typedef pthread_once_t __gthread_once_t;
 typedef pthread_mutex_t __gthread_mutex_t;
+#ifndef __cplusplus
+typedef pthread_rwlock_t __gthread_rwlock_t;
+#endif
 typedef pthread_mutex_t __gthread_recursive_mutex_t;
 typedef pthread_cond_t __gthread_cond_t;
 typedef struct timespec __gthread_time_t;
@@ -58,6 +61,9 @@ typedef struct timespec __gthread_time_t;
 
 #define __GTHREAD_MUTEX_INIT PTHREAD_MUTEX_INITIALIZER
 #define __GTHREAD_MUTEX_INIT_FUNCTION __gthread_mutex_init_function
+#ifndef __cplusplus
+#define __GTHREAD_RWLOCK_INIT PTHREAD_RWLOCK_INITIALIZER
+#endif
 #define __GTHREAD_ONCE_INIT PTHREAD_ONCE_INIT
 #if defined(PTHREAD_RECURSIVE_MUTEX_INITIALIZER)
 #define __GTHREAD_RECURSIVE_MUTEX_INIT PTHREAD_RECURSIVE_MUTEX_INITIALIZER
@@ -135,6 +141,13 @@ __gthrw(pthread_mutexattr_init)
 __gthrw(pthread_mutexattr_settype)
 __gthrw(pthread_mutexattr_destroy)
 
+#ifndef __cplusplus
+__gthrw(pthread_rwlock_rdlock)
+__gthrw(pthread_rwlock_tryrdlock)
+__gthrw(pthread_rwlock_wrlock)
+__gthrw(pthread_rwlock_trywrlock)
+__gthrw(pthread_rwlock_unlock)
+#endif
 
 #if defined(_LIBOBJC) || defined(_LIBOBJC_WEAK)
 /* Objective-C.  */
@@ -885,6 +898,53 @@ __gthread_cond_destroy (__gthread_cond_t* __cond)
   return __gthrw_(pthread_cond_destroy) (__cond);
 }
 
+#ifndef __cplusplus
+static inline int
+__gthread_rwlock_rdlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+return __gthrw_(pthread_rwlock_rdlock) (__rwlock);
+  else
+return 0;
+}
+
+static inline int
+__gthread_rwlock_tryrdlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+return __gthrw_(pthread_rwlock_tryrdlock) (__rwlock);
+  else
+return 0;
+}
+
+static inline int
+__gthread_rwlock_wrlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+return __gthrw_(pthread_rwlock_wrlock) (__rwlock);
+  else
+return 0;
+}
+
+static inline int
+__gthread_rwlock_trywrlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+return __gthrw_(pthread_rwlock_trywrlock) (__rwlock);
+  else
+return 0;
+}
+
+static inline int
+__gthread_rwlock_unlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+return __gthrw_(pthread_rwlock_unlock) (__rwlock);
+  else
+return 0;
+}
+#endif
+
 #endif /* _LIBOBJC */
 
 #endif /* ! GCC_GTHR_POSIX_H */
diff --git a/lib

Re: [PATCH v3] libgfortran: Replace mutex with rwlock

2023-04-19 Thread Bernhard Reutner-Fischer via Fortran
On 19 April 2023 09:06:28 CEST, Lipeng Zhu via Fortran  
wrote:

>+#ifdef __GTHREAD_RWLOCK_INIT
>+#define RWLOCK_DEBUG_ADD(rwlock) do { \
>+aio_rwlock_debug *n;  \
>+n = malloc (sizeof(aio_rwlock_debug));\

My malloc can fail.

>+n->prev = TAIL_RWLOCK_DEBUG_QUEUE;\
>+if (n->prev)  \
>+  n->prev->next = n;  \
>+n->next = NULL;   \
>+n->line = __LINE__;   \
>+n->func = __FUNCTION__;   \
>+n->rw = rwlock;   \
>+if (!aio_rwlock_debug_head) { \
>+  aio_rwlock_debug_head = n;  \
>+} \
>+  } while (0)
>+



Re: [PATCH v3] libgfortran: Replace mutex with rwlock

2023-04-19 Thread Bernhard Reutner-Fischer via Fortran
On Wed, 19 Apr 2023 at 14:51, Bernhard Reutner-Fischer
 wrote:
>
> On 19 April 2023 09:06:28 CEST, Lipeng Zhu via Fortran  
> wrote:
>
> >+#ifdef __GTHREAD_RWLOCK_INIT
> >+#define RWLOCK_DEBUG_ADD(rwlock) do { \
> >+aio_rwlock_debug *n;  \
> >+n = malloc (sizeof(aio_rwlock_debug));\
>
> My malloc can fail.

Sorry, i hit send too early.
Please use xmalloc as defined in libgfortran/runtime/memory.c

PS: IIRC we have likely() / unlikely() macros in libgfortran, so you
may want to check if you want to annotate some conditions accordingly
if predict gets it wrong.
thanks,
>
> >+n->prev = TAIL_RWLOCK_DEBUG_QUEUE;\
> >+if (n->prev)  \
> >+  n->prev->next = n;  \
> >+n->next = NULL;   \
> >+n->line = __LINE__;   \
> >+n->func = __FUNCTION__;   \
> >+n->rw = rwlock;   \
> >+if (!aio_rwlock_debug_head) { \
> >+  aio_rwlock_debug_head = n;  \
> >+} \
> >+  } while (0)
> >+
>


Re: [PATCH] testsuite: fix scan-tree-dump patterns [PR83904, PR100297]

2023-04-19 Thread Bernhard Reutner-Fischer via Fortran
On Wed, 19 Apr 2023 at 03:03, Jerry D via Fortran  wrote:
>
> On 4/18/23 12:39 PM, Harald Anlauf via Fortran wrote:
> > Dear all,
> >
> > the attached patch adjusts the scan-tree-dump patterns of the
> > reported testcases which likely were run in a location such
> > that a path in an error message showing in the tree-dump might
> > have accidentally matched "free" or "data", respectively.
> >
> > For the testcase gfortran.dg/reshape_8.f90 I checked with a
> > failing gfortran-11 that the pattern is appropriate.
> >
> > OK for mainline?
> >
> > Thanks,
> > Harald
> >
> Yes, OK

I'm certainly not opposed to this specific incarnation of such a fix.
These failures are really unpleasant :)
As proposed in 
https://inbox.sourceware.org/gcc-patches/20220426010029.2b476337@nbbrfq/
we could add a -fno-file to suppress the assembler .file output
(whatever the prefix looks like depends on the assembler dialect). Or
we could nuke the .file directives by a sed(1), but that would
probably be cumbersome for remote targets. I don't have a better idea
than -fno-file or -ffile=foo.c .
Fixing them case-by-case does not scale all that well IMHO.

Thoughts?


Re: [PATCH] testsuite: fix scan-tree-dump patterns [PR83904, PR100297]

2023-04-19 Thread Harald Anlauf via Fortran

On 4/19/23 17:14, Bernhard Reutner-Fischer via Gcc-patches wrote:

On Wed, 19 Apr 2023 at 03:03, Jerry D via Fortran  wrote:


On 4/18/23 12:39 PM, Harald Anlauf via Fortran wrote:

Dear all,

the attached patch adjusts the scan-tree-dump patterns of the
reported testcases which likely were run in a location such
that a path in an error message showing in the tree-dump might
have accidentally matched "free" or "data", respectively.

For the testcase gfortran.dg/reshape_8.f90 I checked with a
failing gfortran-11 that the pattern is appropriate.

OK for mainline?

Thanks,
Harald


Yes, OK


I'm certainly not opposed to this specific incarnation of such a fix.
These failures are really unpleasant :)
As proposed in 
https://inbox.sourceware.org/gcc-patches/20220426010029.2b476337@nbbrfq/
we could add a -fno-file to suppress the assembler .file output
(whatever the prefix looks like depends on the assembler dialect). Or
we could nuke the .file directives by a sed(1), but that would
probably be cumbersome for remote targets. I don't have a better idea
than -fno-file or -ffile=foo.c .
Fixing them case-by-case does not scale all that well IMHO.

Thoughts?



?

It wasn't the tree-dumps being at fault, it was the scan patterns.



Re: [PATCH v3] libgfortran: Replace mutex with rwlock

2023-04-19 Thread Bernhard Reutner-Fischer via Fortran
On 19 April 2023 09:06:28 CEST, Lipeng Zhu via Fortran  
wrote:
>This patch try to introduce the rwlock and split the read/write to
>unit_root tree and unit_cache with rwlock instead of the mutex to
>increase CPU efficiency. In the get_gfc_unit function, the percentage
>to step into the insert_unit function is around 30%, in most instances,
>we can get the unit in the phase of reading the unit_cache or unit_root
>tree. So split the read/write phase by rwlock would be an approach to
>make it more parallel.
>
>BTW, the IPC metrics can gain around 9x in our test
>server with 220 cores. The benchmark we used is
>https://github.com/rwesson/NEAT
>

>+#define RD_TO_WRLOCK(rwlock) \
>+  RWUNLOCK (rwlock);\
>+  WRLOCK (rwlock);
>+#endif
>+


>diff --git a/libgfortran/io/unit.c b/libgfortran/io/unit.c
>index 82664dc5f98..4312c5f36de 100644
>--- a/libgfortran/io/unit.c
>+++ b/libgfortran/io/unit.c

>@@ -329,7 +335,7 @@ get_gfc_unit (int n, int do_create)
>   int c, created = 0;
> 
>   NOTE ("Unit n=%d, do_create = %d", n, do_create);
>-  LOCK (&unit_lock);
>+  RDLOCK (&unit_rwlock);
> 
> retry:
>   for (c = 0; c < CACHE_SIZE; c++)
>@@ -350,6 +356,7 @@ retry:
>   if (c == 0)
>   break;
> }
>+  RD_TO_WRLOCK (&unit_rwlock);

So I'm trying to convince myself why it's safe to unlock and only then take the 
write lock.

Can you please elaborate/confirm why that's ok?

I wouldn't mind a comment like
We can release the unit and cache read lock now. We might have to allocate a 
(locked) unit, below in do_create.
Either way, we will manipulate the cache and/or the unit list so we have to 
take a write lock now.

We don't take the write bit in *addition* to the read lock because..

(that needlessly complicates releasing the respective locks / it triggers too 
much contention when we.. / ...?)

thanks,

> 
>   if (p == NULL && do_create)
> {
>@@ -368,8 +375,8 @@ retry:
>   if (created)
> {
>   /* Newly created units have their lock held already
>-   from insert_unit.  Just unlock UNIT_LOCK and return.  */
>-  UNLOCK (&unit_lock);
>+   from insert_unit.  Just unlock UNIT_RWLOCK and return.  */
>+  RWUNLOCK (&unit_rwlock);
>   return p;
> }
> 
>@@ -380,7 +387,7 @@ found:
>   if (! TRYLOCK (&p->lock))
>   {
> /* assert (p->closed == 0); */
>-UNLOCK (&unit_lock);
>+RWUNLOCK (&unit_rwlock);
> return p;
>   }
> 
>@@ -388,14 +395,14 @@ found:
> }
> 
> 
>-  UNLOCK (&unit_lock);
>+  RWUNLOCK (&unit_rwlock);
> 
>   if (p != NULL && (p->child_dtio == 0))
> {
>   LOCK (&p->lock);
>   if (p->closed)
>   {
>-LOCK (&unit_lock);
>+WRLOCK (&unit_rwlock);
> UNLOCK (&p->lock);
> if (predec_waiting_locked (p) == 0)
>   destroy_unit_mutex (p);
>@@ -593,8 +600,8 @@ init_units (void)
> #endif
> #endif
> 
>-#ifndef __GTHREAD_MUTEX_INIT
>-  __GTHREAD_MUTEX_INIT_FUNCTION (&unit_lock);
>+#if (!defined(__GTHREAD_RWLOCK_INIT) && !defined(__GTHREAD_MUTEX_INIT))
>+  __GTHREAD_MUTEX_INIT_FUNCTION (&unit_rwlock);
> #endif
> 
>   if (sizeof (max_offset) == 8)