[gclist] synchronization cost (was: Garbage collection and XM L)

Boehm, Hans hans_boehm@hp.com
Fri, 9 Mar 2001 17:36:32 -0800


I just tried this on a 500 MHz Pentium III.  I get about 23 cycles for

lock; cmpxchg

and about 19 or 20 cycles for xchg (which has an implicit lock prefix).

I got consistent results by timing a loop and by looking at an instruction
level profile.  Putting other stuff in the loop didn't seem to affect the
time taken by xchg much.  Here's the code in case someone else wants to try.
(This requires Linux/gcc)

(Compile with gcc -static -O -DPROF swap.c prof.c to get profile.)

swap.c:
--------------------------------------------
#include <stdio.h>

typedef int GC_bool;

         inline static GC_bool GC_test_and_set(volatile unsigned *addr)
         {
	   int oldval;
	  /* Note: the "xchg" instruction does not need a "lock" prefix */
	  __asm__ __volatile__("xchgl %0, %1"
		: "=r"(oldval), "=m"(*(addr))
		: "0"(1), "m"(*(addr)) : "memory");
	  return oldval;
         }

volatile unsigned lock;

int main()
{
    int i;
#   ifdef PROF
	init_profiling();
#   endif
    for (i = 0; i < 10000000; ++i) {
        int result;
	result = GC_test_and_set(&lock);
	lock = 0;
	if (result) printf("Failed\n");
    }
#   ifdef PROF
	dump_profile();
#   endif
    return 0;
}
----------------------------------------------------

prof.c:
----------------------------------------------------

#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>

/* A very simple profiler.  Note that it should be possible to	*/
/* get function level information by concatenating this with nm	*/
/* output and running the result through the sort utility.	*/
/* This assumes that all interesting parts of the executable	*/
/* are statically linked.					*/

static size_t buf_size;
static u_short *profil_buf;

# ifdef __i386__
#   ifndef COMPRESSION
#     define COMPRESSION 1
#   endif
#   define TEXT_START 0x08000000
#   define PTR_DIGS 8
# endif
# ifdef __ia64__
#   ifndef COMPRESSION
#     define COMPRESSION 8
#   endif
#   define TEXT_START 0x4000000000000000
#   define PTR_DIGS 16 
# endif

extern int etext;

/*
 * Note that the ith entry in the profile buffer corresponds to
 * a PC value of TEXT_START + i * COMPRESSION * 2.
 * The extra factor of 2 is not apparent from the documentation,
 * but it is explicit in the glibc source.
 */

void init_profiling()
{
    buf_size = ((size_t)(&etext) - TEXT_START + 0x10)/COMPRESSION/2;
    profil_buf = calloc(buf_size, sizeof(u_short));
    if (profil_buf == 0) {
	fprintf(stderr, "Could not allocate profile buffer\n");
    }
    profil(profil_buf, buf_size * sizeof(u_short),
	   TEXT_START, 65536/COMPRESSION);
}

void dump_profile()
{
    size_t i;
    size_t sum = 0;
    for (i = 0; i < buf_size; ++i) {
	if (profil_buf[i] != 0) {
	    fprintf(stderr, "%0*lx\t%d !PROF!\n",
		    PTR_DIGS,
		    TEXT_START + i * COMPRESSION * 2,
		    profil_buf[i]);
	    sum += profil_buf[i];
	}
    }
    fprintf(stderr, "Total number of samples was %ld !PROF!\n", sum);
}

-------------------------------------------------


> -----Original Message-----
> From: Ji-Yong D. Chung [mailto:virtualcyber@erols.com]
> Sent: Friday, March 09, 2001 4:51 PM
> To: gclist@iecc.com
> Subject: Re: [gclist] synchronization cost (was: Garbage 
> collection and
> XML)
> 
> 
>     And I forgot to mention that EnterCriticalSection takes 
> (I have read)
> about 6 CPU cycles in optimal case.
> 
>     I have seen implementations of non-reentrant spin locks
> that  take a 5 cycles per lock (implemented using
> LOCK and XCHG and MOV.  LOCK takes 1 CPU cycle, XCHG takes
> 3 CPU cycles and MOV takes 1 cycle).
> 
>