From ilya Thu Mar 12 22:09:46 1998
Subject: Re: "newer" malloc [PATCH 5.004_62]
To: lukka@fas.harvard.edu (Tuomas Lukka)
Date: Thu, 12 Mar 1998 22:09:46 -0500 (EST)
Cc: koenig@kulturbox.de (Andreas J. Koenig),
	perl5-porters@perl.org (Mailing list Perl5)
In-Reply-To: <Pine.OSF.3.96.980312091901.16862B-100000@login5.fas.harvard.edu> from "Tuomas Lukka" at Mar 12, 1998 09:19:26 AM
X-Mailer: ELM [version 2.5 PL0b1]
Content-Length: 46996     
Status: OR

Tuomas Lukka writes:
> 
> On 12 Mar 1998, Andreas J. Koenig wrote:
> 
> > >>>>> On Wed, 11 Mar 1998 23:06:13 -0500 (EST), Ilya Zakharevich <ilya@math.ohio-state.edu> said:
> > 
> > ilya> If there is any interest in merging it, I may try to build _62 and
> > ilya> merge with the changes 56--->62,
> > 
> > Yes, I'm in favor of malloc patches. Personally I'm quite unable to
> > keep track of your patches and only use them occasionally. But I
> > believe, I could really benefit from them.
> 
> Another vote in favor...

OK, OK, the benchmark results and the patch is below.  Note that the
slightly slower find-bucket is competing with significantly-improved
sbrk()-algorithm (and lower memory footprint?) in the examples below.
Otherwise I cannot explain why the results are varying so widely...

> Of course, I'm also dreaming of lvalue subs...

Well, if malloc patch goes in (hint hint), and if subroutines
attributes are allowed now (are they?) I may even do it some time
soon...  The older way is frowned upon now, so without working sub
attributes I see no way to redo this patch...

Ilya

A) perl-5.00462
        path        = ../perl5.004_62/perl
        cc          = cc
        optimize    = -O
        ccflags     = -I/opt/local/include -I/opt/gnu/include
        usemymalloc = y

B) perl-5.00462
        path        = ../perl5.004_62/perl-mymalloc
        cc          = cc
        optimize    = -O
        ccflags     = -I/opt/local/include -I/opt/gnu/include
        usemymalloc = y

                           A       B
                        ----    ----
arith/mixed              100     121
arith/trig               100      99
array/copy               100      98
array/foreach            100      96
array/index              100      87
array/pop                100      98
array/shift              100     100
call/0arg-noamp          100      93
call/0arg                100      97
call/1arg                100     104
call/2arg                100     103
call/9arg                100     101
call/empty               100      88
call/fib                 100     103
call/method              100     101
call/wantarray           100     103
hash/copy                100      89
hash/each                100      89
hash/foreach-sort        100      82
hash/foreach             100     101
hash/get                 100     111
hash/set                 100     103
loop/for-c               100     103
loop/for-interval        100     106
loop/getline             100      96
loop/while-my            100     113
loop/while               100     107
re/const                 100     102
re/w                     100      96
startup/fewmod           100     108
startup/lotsofsub        100     111
startup/noprog           100     101
string/base64            100      97
string/htmlparser        100      98
string/index-const       100      98
string/index-var         100      92
string/ipol              100      86

AVERAGE                  100      99


--- ./malloc.c	Thu Mar 12 21:00:44 1998
+++ ./malloc.c-mymalloc	Thu Mar 12 21:00:03 1998
@@ -2,14 +2,52 @@
  *
  */
 
-#if defined(PERL_CORE) && !defined(DEBUGGING_MSTATS)
-#  define DEBUGGING_MSTATS
+#ifndef NO_FANCY_MALLOC
+#  ifndef SMALL_BUCKET_VIA_TABLE
+#    define SMALL_BUCKET_VIA_TABLE
+#  endif 
+#  ifndef BUCKETS_ROOT2
+#    define BUCKETS_ROOT2
+#  endif 
+#  ifndef IGNORE_SMALL_BAD_FREE
+#    define IGNORE_SMALL_BAD_FREE
+#  endif 
 #endif 
 
+#ifndef PLAIN_MALLOC			/* Bulk enable features */
+#  ifndef PACK_MALLOC
+#      define PACK_MALLOC
+#  endif 
+#  ifndef TWO_POT_OPTIMIZE
+#    define TWO_POT_OPTIMIZE
+#  endif 
+#  if defined(PERL_CORE) && !defined(EMERGENCY_SBRK)
+#    define EMERGENCY_SBRK
+#  endif 
+#  if defined(PERL_CORE) && !defined(DEBUGGING_MSTATS)
+#    define DEBUGGING_MSTATS
+#  endif 
+#endif
+
+#define MIN_BUC_POW2 (sizeof(void*) > 4 ? 3 : 2) /* Allow for 4-byte arena. */
+#define MIN_BUCKET (MIN_BUC_POW2 * BUCKETS_PER_POW2)
+
+#if !(defined(I286) || defined(atarist))
+	/* take 2k unless the block is bigger than that */
+#  define LOG_OF_MIN_ARENA 11
+#else
+	/* take 16k unless the block is bigger than that 
+	   (80286s like large segments!), probably good on the atari too */
+#  define LOG_OF_MIN_ARENA 14
+#endif
+
 #ifndef lint
 #  if defined(DEBUGGING) && !defined(NO_RCHECK)
 #    define RCHECK
 #  endif
+#  if defined(RCHECK) && defined(IGNORE_SMALL_BAD_FREE)
+#    undef IGNORE_SMALL_BAD_FREE
+#  endif 
 /*
  * malloc.c (Caltech) 2/21/82
  * Chris Kingsley, kingsley@cit-20.
@@ -26,23 +64,50 @@
 #include "EXTERN.h"
 #include "perl.h"
 
+#ifndef PERL_CORE
+#  ifndef croak				/* make depend */
+#    define croak(mess) fprintf(stderr,mess); exit(1);
+#  endif 
+#  ifdef DEBUG_m
+#    undef DEBUG_m
+#  endif 
+#  define DEBUG_m(a)
+#  ifdef DEBUGGING
+#     undef DEBUGGING
+#  endif
+#endif
+
+#ifndef MUTEX_LOCK
+#  define MUTEX_LOCK(l)
+#endif 
+
+#ifndef MUTEX_UNLOCK
+#  define MUTEX_UNLOCK(l)
+#endif 
+
 #ifdef DEBUGGING
-#undef DEBUG_m
-#define DEBUG_m(a)  if (debug & 128)   a
+#  undef DEBUG_m
+#  define DEBUG_m(a)  if (debug & 128)   a
 #endif
 
 /* I don't much care whether these are defined in sys/types.h--LAW */
 
 #define u_char unsigned char
 #define u_int unsigned int
+
+#ifdef HAS_QUAD
+#  define u_bigint UV			/* Needs to eat *void. */
+#else  /* needed? */
+#  define u_bigint unsigned long	/* Needs to eat *void. */
+#endif
+
 #define u_short unsigned short
 
 /* 286 and atarist like big chunks, which gives too much overhead. */
 #if (defined(RCHECK) || defined(I286) || defined(atarist)) && defined(PACK_MALLOC)
-#undef PACK_MALLOC
+#  undef PACK_MALLOC
 #endif 
 
-
 /*
  * The description below is applicable if PACK_MALLOC is not defined.
  *
@@ -81,72 +146,205 @@ static int findbucket _((union overhead 
 
 #define	MAGIC		0xff		/* magic # on accounting info */
 #define RMAGIC		0x55555555	/* magic # on range info */
+#define RMAGIC_C	0x55		/* magic # on range info */
+
 #ifdef RCHECK
 #  define	RSLOP		sizeof (u_int)
 #  ifdef TWO_POT_OPTIMIZE
-#    define MAX_SHORT_BUCKET 12
+#    define MAX_SHORT_BUCKET (12 * BUCKETS_PER_POW2)
 #  else
-#    define MAX_SHORT_BUCKET 13
+#    define MAX_SHORT_BUCKET (13 * BUCKETS_PER_POW2)
 #  endif 
 #else
 #  define	RSLOP		0
 #endif
 
+#if !defined(PACK_MALLOC) && defined(BUCKETS_ROOT2)
+#  undef BUCKETS_ROOT2
+#endif 
+
+#ifdef BUCKETS_ROOT2
+#  define BUCKET_TABLE_SHIFT 2
+#  define BUCKET_POW2_SHIFT 1
+#  define BUCKETS_PER_POW2 2
+#else
+#  define BUCKET_TABLE_SHIFT MIN_BUC_POW2
+#  define BUCKET_POW2_SHIFT 0
+#  define BUCKETS_PER_POW2 1
+#endif 
+
+#ifdef BUCKETS_ROOT2
+#  define MAX_BUCKET_BY_TABLE 13
+static u_short buck_size[MAX_BUCKET_BY_TABLE + 1] = 
+  { 
+      0, 0, 0, 0, 4, 4, 8, 12, 16, 24, 32, 48, 64, 80,
+  };
+#  define BUCKET_SIZE(i) ((i) % 2 ? buck_size[i] : (1 << ((i) >> BUCKET_POW2_SHIFT)))
+#  define BUCKET_SIZE_REAL(i) ((i) <= MAX_BUCKET_BY_TABLE		\
+			       ? buck_size[i] 				\
+			       : ((1 << ((i) >> BUCKET_POW2_SHIFT))	\
+				  - MEM_OVERHEAD(i)			\
+				  + POW2_OPTIMIZE_SURPLUS(i)))
+#else
+#  define BUCKET_SIZE(i) (1 << ((i) >> BUCKET_POW2_SHIFT))
+#  define BUCKET_SIZE_REAL(i) (BUCKET_SIZE(i) - MEM_OVERHEAD(i) + POW2_OPTIMIZE_SURPLUS(i))
+#endif 
+
+
 #ifdef PACK_MALLOC
-/*
- * In this case it is assumed that if we do sbrk() in 2K units, we
- * will get 2K aligned blocks. The bucket number of the given subblock is
- * on the boundary of 2K block which contains the subblock.
- * Several following bytes contain the magic numbers for the subblocks
- * in the block.
+/* In this case it is assumed that if we do sbrk() in 2K units, we
+ * will get 2K aligned arenas (at least after some initial
+ * alignment). The bucket number of the given subblock is on the start
+ * of 2K arena which contains the subblock.  Several following bytes
+ * contain the magic numbers for the subblocks in the block.
  *
  * Sizes of chunks are powers of 2 for chunks in buckets <=
  * MAX_PACKED, after this they are (2^n - sizeof(union overhead)) (to
  * get alignment right).
  *
- * We suppose that starts of all the chunks in a 2K block are in
- * different 2^n-byte-long chunks.  If the top of the last chunk is
- * aligned on a boundary of 2K block, this means that
- * sizeof(union overhead)*"number of chunks" < 2^n, or
- * sizeof(union overhead)*2K < 4^n, or n > 6 + log2(sizeof()/2)/2, if a
- * chunk of size 2^n - overhead is used. Since this rules out n = 7
- * for 8 byte alignment, we specialcase allocation of the first of 16
- * 128-byte-long chunks.
+ * Consider an arena for 2^n with n>MAX_PACKED.  We suppose that
+ * starts of all the chunks in a 2K arena are in different
+ * 2^n-byte-long chunks.  If the top of the last chunk is aligned on a
+ * boundary of 2K block, this means that sizeof(union
+ * overhead)*"number of chunks" < 2^n, or sizeof(union overhead)*2K <
+ * 4^n, or n > 6 + log2(sizeof()/2)/2, since a chunk of size 2^n -
+ * overhead is used.  Since this rules out n = 7 for 8 byte alignment,
+ * we specialcase allocation of the first of 16 128-byte-long chunks.
  *
  * Note that with the above assumption we automatically have enough
  * place for MAGIC at the start of 2K block.  Note also that we
- * overlay union overhead over the chunk, thus the start of the chunk
- * is immediately overwritten after freeing.
- */
-#  define MAX_PACKED 6
-#  define MAX_2_POT_ALGO ((1<<(MAX_PACKED + 1)) - M_OVERHEAD)
-#  define TWOK_MASK ((1<<11) - 1)
-#  define TWOK_MASKED(x) ((u_int)(x) & ~TWOK_MASK)
-#  define TWOK_SHIFT(x) ((u_int)(x) & TWOK_MASK)
+ * overlay union overhead over the chunk, thus the start of small chunks
+ * is immediately overwritten after freeing.  */
+#  define MAX_PACKED_POW2 6
+#  define MAX_PACKED (MAX_PACKED_POW2 * BUCKETS_PER_POW2 + BUCKET_POW2_SHIFT)
+#  define MAX_POW2_ALGO ((1<<(MAX_PACKED_POW2 + 1)) - M_OVERHEAD)
+#  define TWOK_MASK ((1<<LOG_OF_MIN_ARENA) - 1)
+#  define TWOK_MASKED(x) ((u_bigint)(x) & ~TWOK_MASK)
+#  define TWOK_SHIFT(x) ((u_bigint)(x) & TWOK_MASK)
 #  define OV_INDEXp(block) ((u_char*)(TWOK_MASKED(block)))
 #  define OV_INDEX(block) (*OV_INDEXp(block))
 #  define OV_MAGIC(block,bucket) (*(OV_INDEXp(block) +			\
-				    (TWOK_SHIFT(block)>>(bucket + 3)) + \
-				    (bucket > MAX_NONSHIFT ? 1 : 0)))
+				    (TWOK_SHIFT(block)>>		\
+				     (bucket>>BUCKET_POW2_SHIFT)) +	\
+				    (bucket >= MIN_NEEDS_SHIFT ? 1 : 0)))
+    /* A bucket can have a shift smaller than it size, we need to
+       shift its magic number so it will not overwrite index: */
+#  ifdef BUCKETS_ROOT2
+#    define MIN_NEEDS_SHIFT (7*BUCKETS_PER_POW2 - 1) /* Shift 80 greater than chunk 64. */
+#  else
+#    define MIN_NEEDS_SHIFT (7*BUCKETS_PER_POW2) /* Shift 128 greater than chunk 32. */
+#  endif 
 #  define CHUNK_SHIFT 0
 
-static u_char n_blks[11 - 3]	 = {224, 120, 62, 31, 16, 8, 4, 2};
-static u_short blk_shift[11 - 3] = {256, 128, 64, 32, 
-				    16*sizeof(union overhead), 
-				    8*sizeof(union overhead), 
-				    4*sizeof(union overhead), 
-				    2*sizeof(union overhead), 
-#  define MAX_NONSHIFT 2	/* Shift 64 greater than chunk 32. */
-};
+/* Number of active buckets of given ordinal. */
+#ifdef IGNORE_SMALL_BAD_FREE
+#define FIRST_BUCKET_WITH_CHECK (6 * BUCKETS_PER_POW2) /* 64 */
+#  define N_BLKS(bucket) ( (bucket) < FIRST_BUCKET_WITH_CHECK 		\
+			 ? ((1<<LOG_OF_MIN_ARENA) - 1)/BUCKET_SIZE(bucket) \
+			 : n_blks[bucket] )
+#else
+#  define N_BLKS(bucket) n_blks[bucket]
+#endif 
+
+static u_short n_blks[LOG_OF_MIN_ARENA * BUCKETS_PER_POW2] = 
+  {
+#  if BUCKETS_PER_POW2==1
+      0, 0,
+      (MIN_BUC_POW2==2 ? 384 : 0),
+      224, 120, 62, 31, 16, 8, 4, 2
+#  else
+      0, 0, 0, 0,
+      (MIN_BUC_POW2==2 ? 384 : 0), (MIN_BUC_POW2==2 ? 384 : 0),	/* 4, 4 */
+      224, 149, 120, 80, 62, 41, 31, 25, 16, 16, 8, 8, 4, 4, 2, 2
+#  endif
+  };
+
+/* Shift of the first bucket with the given ordinal inside 2K chunk. */
+#ifdef IGNORE_SMALL_BAD_FREE
+#  define BLK_SHIFT(bucket) ( (bucket) < FIRST_BUCKET_WITH_CHECK 	\
+			      ? ((1<<LOG_OF_MIN_ARENA)			\
+				 - BUCKET_SIZE(bucket) * N_BLKS(bucket)) \
+			      : blk_shift[bucket])
+#else
+#  define BLK_SHIFT(bucket) blk_shift[bucket]
+#endif 
+
+static u_short blk_shift[LOG_OF_MIN_ARENA * BUCKETS_PER_POW2] = 
+  { 
+#  if BUCKETS_PER_POW2==1
+      0, 0,
+      (MIN_BUC_POW2==2 ? 512 : 0),
+      256, 128, 64, 64,			/* 8 to 64 */
+      16*sizeof(union overhead), 
+      8*sizeof(union overhead), 
+      4*sizeof(union overhead), 
+      2*sizeof(union overhead), 
+#  else
+      0, 0, 0, 0,
+      (MIN_BUC_POW2==2 ? 512 : 0), (MIN_BUC_POW2==2 ? 512 : 0),
+      256, 260, 128, 128, 64, 80, 64, 48, /* 8 to 96 */
+      16*sizeof(union overhead), 16*sizeof(union overhead), 
+      8*sizeof(union overhead), 8*sizeof(union overhead), 
+      4*sizeof(union overhead), 4*sizeof(union overhead), 
+      2*sizeof(union overhead), 2*sizeof(union overhead), 
+#  endif 
+  };
 
 #else  /* !PACK_MALLOC */
 
 #  define OV_MAGIC(block,bucket) (block)->ov_magic
 #  define OV_INDEX(block) (block)->ov_index
 #  define CHUNK_SHIFT 1
+#  define MAX_PACKED -1
 #endif /* !PACK_MALLOC */
 
-#  define M_OVERHEAD (sizeof(union overhead) + RSLOP)
+#define M_OVERHEAD (sizeof(union overhead) + RSLOP)
+
+#ifdef PACK_MALLOC
+#  define MEM_OVERHEAD(bucket) \
+  (bucket <= MAX_PACKED ? 0 : M_OVERHEAD)
+#  ifdef SMALL_BUCKET_VIA_TABLE
+#    define START_SHIFTS_BUCKET ((MAX_PACKED_POW2 + 1) * BUCKETS_PER_POW2)
+#    define START_SHIFT MAX_PACKED_POW2
+#    ifdef BUCKETS_ROOT2		/* Chunks of size 3*2^n. */
+#      define SIZE_TABLE_MAX 80
+#    else
+#      define SIZE_TABLE_MAX 64
+#    endif 
+static char bucket_of[] =
+  {
+#    ifdef BUCKETS_ROOT2		/* Chunks of size 3*2^n. */
+      /* 0 to 15 in 4-byte increments. */
+      (sizeof(void*) > 4 ? 6 : 5),	/* 4/8, 5-th bucket for better reports */
+      6,				/* 8 */
+      7, 8,				/* 12, 16 */
+      9, 9, 10, 10,			/* 24, 32 */
+      11, 11, 11, 11,			/* 48 */
+      12, 12, 12, 12,			/* 64 */
+      13, 13, 13, 13,			/* 80 */
+      13, 13, 13, 13			/* 80 */
+#    else /* !BUCKETS_ROOT2 */
+      /* 0 to 15 in 4-byte increments. */
+      (sizeof(void*) > 4 ? 3 : 2),
+      3, 
+      4, 4, 
+      5, 5, 5, 5,
+      6, 6, 6, 6,
+      6, 6, 6, 6
+#    endif /* !BUCKETS_ROOT2 */
+  };
+#  else  /* !SMALL_BUCKET_VIA_TABLE */
+#    define START_SHIFTS_BUCKET MIN_BUCKET
+#    define START_SHIFT (MIN_BUC_POW2 - 1)
+#  endif /* !SMALL_BUCKET_VIA_TABLE */
+#else  /* !PACK_MALLOC */
+#  define MEM_OVERHEAD(bucket) M_OVERHEAD
+#  ifdef SMALL_BUCKET_VIA_TABLE
+#    undef SMALL_BUCKET_VIA_TABLE
+#  endif 
+#  define START_SHIFTS_BUCKET MIN_BUCKET
+#  define START_SHIFT (MIN_BUC_POW2 - 1)
+#endif /* !PACK_MALLOC */
 
 /*
  * Big allocations are often of the size 2^n bytes. To make them a
@@ -158,23 +356,63 @@ static u_short blk_shift[11 - 3] = {256,
 #  ifndef PERL_PAGESIZE
 #    define PERL_PAGESIZE 4096
 #  endif 
-#  ifndef FIRST_BIG_TWO_POT
-#    define FIRST_BIG_TWO_POT 14	/* 16K */
+#  ifndef FIRST_BIG_POW2
+#    define FIRST_BIG_POW2 15	/* 32K, 16K is used too often. */
 #  endif
-#  define FIRST_BIG_BLOCK (1<<FIRST_BIG_TWO_POT) /* 16K */
+#  define FIRST_BIG_BLOCK (1<<FIRST_BIG_POW2)
 /* If this value or more, check against bigger blocks. */
 #  define FIRST_BIG_BOUND (FIRST_BIG_BLOCK - M_OVERHEAD)
 /* If less than this value, goes into 2^n-overhead-block. */
 #  define LAST_SMALL_BOUND ((FIRST_BIG_BLOCK>>1) - M_OVERHEAD)
 
-#endif /* TWO_POT_OPTIMIZE */
+#  define POW2_OPTIMIZE_ADJUST(nbytes)				\
+   ((nbytes >= FIRST_BIG_BOUND) ? nbytes -= PERL_PAGESIZE : 0)
+#  define POW2_OPTIMIZE_SURPLUS(bucket)				\
+   ((bucket >= FIRST_BIG_POW2 * BUCKETS_PER_POW2) ? PERL_PAGESIZE : 0)
+
+#else  /* !TWO_POT_OPTIMIZE */
+#  define POW2_OPTIMIZE_ADJUST(nbytes)
+#  define POW2_OPTIMIZE_SURPLUS(bucket) 0
+#endif /* !TWO_POT_OPTIMIZE */
+
+#if defined(HAS_64K_LIMIT) && defined(PERL_CORE)
+#  define BARK_64K_LIMIT(what,nbytes,size)				\
+	if (nbytes > 0xffff) {						\
+		PerlIO_printf(PerlIO_stderr(),				\
+			      "%s too large: %lx\n", what, size);	\
+		my_exit(1);						\
+	}
+#else /* !HAS_64K_LIMIT || !PERL_CORE */
+#  define BARK_64K_LIMIT(what,nbytes,size)
+#endif /* !HAS_64K_LIMIT || !PERL_CORE */
+
+#ifndef MIN_SBRK
+#  define MIN_SBRK 2048
+#endif 
 
-#if defined(PERL_EMERGENCY_SBRK) && defined(PERL_CORE)
+#ifndef FIRST_SBRK
+#  define FIRST_SBRK (32*1024)
+#endif 
+
+/* Minimal sbrk in percents of what is already alloced. */
+#ifndef MIN_SBRK_FRAC
+#  define MIN_SBRK_FRAC 3
+#endif 
+
+#ifndef SBRK_ALLOW_FAILURES
+#  define SBRK_ALLOW_FAILURES 3
+#endif 
 
-#ifndef BIG_SIZE
-#  define BIG_SIZE (1<<16)		/* 64K */
+#ifndef SBRK_FAILURE_PRICE
+#  define SBRK_FAILURE_PRICE 50
 #endif 
 
+#if defined(PERL_EMERGENCY_SBRK) && defined(PERL_CORE)
+
+#  ifndef BIG_SIZE
+#    define BIG_SIZE (1<<16)		/* 64K */
+#  endif 
+
 static char *emergency_buffer;
 static MEM_SIZE emergency_buffer_size;
 
@@ -198,13 +436,13 @@ emergency_sbrk(size)
 
 	if (!gvp) gvp = (GV**)hv_fetch(defstash, "\015", 1, 0);
 	if (!gvp || !(sv = GvSV(*gvp)) || !SvPOK(sv) 
-	    || (SvLEN(sv) < (1<<11) - M_OVERHEAD)) 
+	    || (SvLEN(sv) < (1<<LOG_OF_MIN_ARENA) - M_OVERHEAD)) 
 	    return (char *)-1;		/* Now die die die... */
 
 	/* Got it, now detach SvPV: */
 	pv = SvPV(sv, na);
 	/* Check alignment: */
-	if (((u_int)(pv - M_OVERHEAD)) & ((1<<11) - 1)) {
+	if (((u_bigint)(pv - M_OVERHEAD)) & ((1<<LOG_OF_MIN_ARENA) - 1)) {
 	    PerlIO_puts(PerlIO_stderr(),"Bad alignment of $^M!\n");
 	    return (char *)-1;		/* die die die */
 	}
@@ -223,16 +461,16 @@ emergency_sbrk(size)
     return (char *)-1;			/* poor guy... */
 }
 
-#else /* !(defined(TWO_POT_OPTIMIZE) && defined(PERL_CORE)) */
+#else /* !(defined(PERL_EMERGENCY_SBRK) && defined(PERL_CORE)) */
 #  define emergency_sbrk(size)	-1
-#endif /* !(defined(TWO_POT_OPTIMIZE) && defined(PERL_CORE)) */
+#endif /* !(defined(PERL_EMERGENCY_SBRK) && defined(PERL_CORE)) */
 
 /*
- * nextf[i] is the pointer to the next free block of size 2^(i+3).  The
+ * nextf[i] is the pointer to the next free block of size 2^i.  The
  * smallest allocatable block is 8 bytes.  The overhead information
  * precedes the data area returned to the user.
  */
-#define	NBUCKETS 30
+#define	NBUCKETS (32*BUCKETS_PER_POW2 + 1)
 static	union overhead *nextf[NBUCKETS];
 
 #ifdef USE_PERL_SBRK
@@ -254,11 +492,12 @@ extern	Malloc_t sbrk(int);
  * for a given block size.
  */
 static	u_int nmalloc[NBUCKETS];
-static	u_int goodsbrk;
 static  u_int sbrk_slack;
 static  u_int start_slack;
 #endif
 
+static	u_int goodsbrk;
+
 #ifdef DEBUGGING
 #define	ASSERT(p)   if (!(p)) botch(STRINGIFY(p));  else
 static void
@@ -275,26 +514,18 @@ Malloc_t
 malloc(register size_t nbytes)
 {
   	register union overhead *p;
-  	register int bucket = 0;
+  	register int bucket;
   	register MEM_SIZE shiftr;
 
 #if defined(DEBUGGING) || defined(RCHECK)
 	MEM_SIZE size = nbytes;
 #endif
 
-#ifdef PERL_CORE
-#ifdef HAS_64K_LIMIT
-	if (nbytes > 0xffff) {
-		PerlIO_printf(PerlIO_stderr(),
-			      "Allocation too large: %lx\n", (long)nbytes);
-		my_exit(1);
-	}
-#endif /* HAS_64K_LIMIT */
+	BARK_64K_LIMIT("Allocation",nbytes,nbytes);
 #ifdef DEBUGGING
 	if ((long)nbytes < 0)
 		croak("panic: malloc");
 #endif
-#endif /* PERL_CORE */
 
 	MUTEX_LOCK(&malloc_mutex);
 	/*
@@ -304,29 +535,37 @@ malloc(register size_t nbytes)
 	 * space used per block for accounting.
 	 */
 #ifdef PACK_MALLOC
+#  ifdef SMALL_BUCKET_VIA_TABLE
+	if (nbytes == 0)
+	    bucket = MIN_BUCKET;
+	else if (nbytes <= SIZE_TABLE_MAX) {
+	    bucket = bucket_of[(nbytes - 1) >> BUCKET_TABLE_SHIFT];
+	} else
+#  else
 	if (nbytes == 0)
 	    nbytes = 1;
-	else if (nbytes > MAX_2_POT_ALGO)
-#endif
-	{
-#ifdef TWO_POT_OPTIMIZE
-		if (nbytes >= FIRST_BIG_BOUND)
-			nbytes -= PERL_PAGESIZE;
+	if (nbytes <= MAX_POW2_ALGO) goto do_shifts;
+	else
+#  endif
 #endif 
-		nbytes += M_OVERHEAD;
-		nbytes = (nbytes + 3) &~ 3; 
+	{
+	    POW2_OPTIMIZE_ADJUST(nbytes);
+	    nbytes += M_OVERHEAD;
+	    nbytes = (nbytes + 3) &~ 3; 
+	  do_shifts:
+	    shiftr = (nbytes - 1) >> START_SHIFT;
+	    bucket = START_SHIFTS_BUCKET;
+	    /* apart from this loop, this is O(1) */
+	    while (shiftr >>= 1)
+  		bucket += BUCKETS_PER_POW2;
 	}
-  	shiftr = (nbytes - 1) >> 2;
-	/* apart from this loop, this is O(1) */
-  	while (shiftr >>= 1)
-  		bucket++;
 	/*
 	 * If nothing in hash bucket right now,
 	 * request more memory from the system.
 	 */
   	if (nextf[bucket] == NULL)    
   		morecore(bucket);
-  	if ((p = (union overhead *)nextf[bucket]) == NULL) {
+  	if ((p = nextf[bucket]) == NULL) {
 		MUTEX_UNLOCK(&malloc_mutex);
 #ifdef PERL_CORE
 		if (!nomemok) {
@@ -338,10 +577,10 @@ malloc(register size_t nbytes)
 #endif
 	}
 
-#ifdef PERL_CORE
-    DEBUG_m(PerlIO_printf(Perl_debug_log, "0x%lx: (%05lu) malloc %ld bytes\n",
-	(unsigned long)(p+1),(unsigned long)(an++),(long)size));
-#endif /* PERL_CORE */
+	DEBUG_m(PerlIO_printf(Perl_debug_log,
+			      "0x%lx: (%05lu) malloc %ld bytes\n",
+			      (unsigned long)(p+1), (unsigned long)(an++),
+			      (long)size));
 
 	/* remove from linked list */
 #ifdef RCHECK
@@ -350,7 +589,10 @@ malloc(register size_t nbytes)
 		(unsigned long)*((int*)p),(unsigned long)p);
 #endif
   	nextf[bucket] = p->ov_next;
-	OV_MAGIC(p, bucket) = MAGIC;
+#ifdef IGNORE_SMALL_BAD_FREE
+	if (bucket >= FIRST_BUCKET_WITH_CHECK)
+#endif 
+	    OV_MAGIC(p, bucket) = MAGIC;
 #ifndef PACK_MALLOC
 	OV_INDEX(p) = bucket;
 #endif
@@ -359,16 +601,124 @@ malloc(register size_t nbytes)
 	 * Record allocated size of block and
 	 * bound space with magic numbers.
 	 */
-	nbytes = (size + M_OVERHEAD + 3) &~ 3; 
-  	if (nbytes <= 0x10000)
-		p->ov_size = nbytes - 1;
 	p->ov_rmagic = RMAGIC;
-  	*((u_int *)((caddr_t)p + nbytes - RSLOP)) = RMAGIC;
+	if (bucket <= MAX_SHORT_BUCKET) {
+	    int i;
+	    
+	    nbytes = size + M_OVERHEAD; 
+	    p->ov_size = nbytes - 1;
+	    if ((i = nbytes & 3)) {
+		i = 4 - i;
+		while (i--)
+		    *((char *)((caddr_t)p + nbytes - RSLOP + i)) = RMAGIC_C;
+	    }
+	    nbytes = (nbytes + 3) &~ 3; 
+	    *((u_int *)((caddr_t)p + nbytes - RSLOP)) = RMAGIC;
+	}
 #endif
 	MUTEX_UNLOCK(&malloc_mutex);
   	return ((Malloc_t)(p + CHUNK_SHIFT));
 }
 
+static char *last_sbrk_top;
+static char *last_op;			/* This arena can be easily extended. */
+static int sbrked_remains;
+static int sbrk_good = SBRK_ALLOW_FAILURES * SBRK_FAILURE_PRICE;
+
+#ifdef DEBUGGING_MSTATS
+static int sbrks;
+#endif 
+
+struct chunk_chain_s {
+    struct chunk_chain_s *next;
+    MEM_SIZE size;
+};
+static struct chunk_chain_s *chunk_chain;
+static int n_chunks;
+static char max_bucket;
+
+/* Cutoff a piece of one of the chunks in the chain.  Prefer smaller chunk. */
+static void *
+get_from_chain(MEM_SIZE size)
+{
+    struct chunk_chain_s *elt = chunk_chain, **oldp = &chunk_chain;
+    struct chunk_chain_s **oldgoodp = NULL;
+    long min_remain = LONG_MAX;
+
+    while (elt) {
+	if (elt->size >= size) {
+	    long remains = elt->size - size;
+	    if (remains >= 0 && remains < min_remain) {
+		oldgoodp = oldp;
+		min_remain = remains;
+	    }
+	    if (remains == 0) {
+		break;
+	    }
+	}
+	oldp = &( elt->next );
+	elt = elt->next;
+    }
+    if (!oldgoodp) return NULL;
+    if (min_remain) {
+	void *ret = *oldgoodp;
+	struct chunk_chain_s *next = (*oldgoodp)->next;
+	
+	*oldgoodp = (struct chunk_chain_s *)((char*)ret + size);
+	(*oldgoodp)->size = min_remain;
+	(*oldgoodp)->next = next;
+	return ret;
+    } else {
+	void *ret = *oldgoodp;
+	*oldgoodp = (*oldgoodp)->next;
+	n_chunks--;
+	return ret;
+    }
+}
+
+static void
+add_to_chain(void *p, MEM_SIZE size, MEM_SIZE chip)
+{
+    struct chunk_chain_s *next = chunk_chain;
+    char *cp = (char*)p;
+    
+    cp += chip;
+    chunk_chain = (struct chunk_chain_s *)cp;
+    chunk_chain->size = size - chip;
+    chunk_chain->next = next;
+    n_chunks++;
+}
+
+static void *
+get_from_bigger_buckets(int bucket, MEM_SIZE size)
+{
+    int price = 1;
+    static int bucketprice[NBUCKETS];
+    while (bucket <= max_bucket) {
+	/* We postpone stealing from bigger buckets until we want it
+	   often enough. */
+	if (nextf[bucket] && bucketprice[bucket]++ >= price) {
+	    /* Steal it! */
+	    void *ret = (void*)(nextf[bucket] - 1 + CHUNK_SHIFT);
+	    bucketprice[bucket] = 0;
+	    if (((char*)nextf[bucket]) - M_OVERHEAD == last_op) {
+		last_op = NULL;		/* Disable optimization */
+	    }
+	    nextf[bucket] = nextf[bucket]->ov_next;
+#ifdef DEBUGGING_MSTATS
+	    nmalloc[bucket]--;
+	    start_slack -= M_OVERHEAD;
+#endif 
+	    add_to_chain(ret, (BUCKET_SIZE(bucket) +
+			       POW2_OPTIMIZE_SURPLUS(bucket)), 
+			 size);
+	    return ret;
+	}
+	bucket++;
+    }
+    return NULL;
+}
+
 /*
  * Allocate more memory to the indicated bucket.
  */
@@ -383,90 +733,206 @@ morecore(register int bucket)
 
   	if (nextf[bucket])
   		return;
-	if (bucket == (sizeof(MEM_SIZE)*8 - 3)) {
+	if (bucket == sizeof(MEM_SIZE)*8*BUCKETS_PER_POW2) {
 	    croak("Allocation too large");
 	}
-	/*
-	 * Insure memory is allocated
-	 * on a page boundary.  Should
-	 * make getpageize call?
-	 */
-#ifndef atarist /* on the atari we dont have to worry about this */
-  	ovp = (union overhead *)sbrk(0);
-#  ifndef I286
-  	if ((UV)ovp & (0x7FF >> CHUNK_SHIFT)) {
-	    slack = (0x800 >> CHUNK_SHIFT) - ((UV)ovp & (0x7FF >> CHUNK_SHIFT));
-	    (void)sbrk(slack);
-#    if defined(DEBUGGING_MSTATS)
-	    sbrk_slack += slack;
-#    endif
+
+	if (bucket > max_bucket) {
+	    max_bucket = bucket;
 	}
-#  else
-	/* The sbrk(0) call on the I286 always returns the next segment */
+  	rnu = ( (bucket <= (LOG_OF_MIN_ARENA << BUCKET_POW2_SHIFT)) 
+		? LOG_OF_MIN_ARENA 
+		: (bucket >> BUCKET_POW2_SHIFT) );
+	/* This may be overwritten later: */
+  	nblks = 1 << (rnu - (bucket >> BUCKET_POW2_SHIFT)); /* how many blocks to get */
+	needed = ((MEM_SIZE)1 << rnu) + POW2_OPTIMIZE_SURPLUS(bucket);
+	if (nextf[rnu << BUCKET_POW2_SHIFT]) { /* 2048b bucket. */
+	    ovp = nextf[rnu << BUCKET_POW2_SHIFT] - 1 + CHUNK_SHIFT;
+	    nextf[rnu << BUCKET_POW2_SHIFT]
+		= nextf[rnu << BUCKET_POW2_SHIFT]->ov_next;
+#ifdef DEBUGGING_MSTATS
+	    nmalloc[rnu << BUCKET_POW2_SHIFT]--;
+	    start_slack -= M_OVERHEAD;
+#endif 
+	    DEBUG_m(PerlIO_printf(Perl_debug_log, 
+				  "stealing %ld bytes from %ld arena\n",
+				  (long) needed, (long) rnu << BUCKET_POW2_SHIFT));
+	} else if (chunk_chain 
+		   && (ovp = (union overhead*) get_from_chain(needed))) {
+	    DEBUG_m(PerlIO_printf(Perl_debug_log, 
+				  "stealing %ld bytes from chain\n",
+				  (long) needed));
+	} else if (ovp = (union overhead*)
+		   get_from_bigger_buckets((rnu << BUCKET_POW2_SHIFT) + 1,
+					   needed)) {
+	    DEBUG_m(PerlIO_printf(Perl_debug_log, 
+				  "stealing %ld bytes from bigger buckets\n",
+				  (long) needed));
+	} else if (needed <= sbrked_remains) {
+	    ovp = (union overhead *)(last_sbrk_top - sbrked_remains);
+	    sbrked_remains -= needed;
+	    last_op = (char*)ovp;
+	} else {
+	    /* Need to do (possibly expensive) system call. Try to
+	       optimize it for rare calling. */
+	    MEM_SIZE require = needed - sbrked_remains;
+	    char *cp;
+
+	    if (sbrk_good > 0) {
+		if (!last_sbrk_top && require < FIRST_SBRK) 
+		    require = FIRST_SBRK;
+		else if (require < MIN_SBRK) require = MIN_SBRK;
+
+		if (require < goodsbrk * MIN_SBRK_FRAC / 100)
+		    require = goodsbrk * MIN_SBRK_FRAC / 100;
+		require = ((require - 1 + MIN_SBRK) / MIN_SBRK) * MIN_SBRK;
+	    } else {
+		require = needed;
+		last_sbrk_top = 0;
+		sbrked_remains = 0;
+	    }
+
+	    DEBUG_m(PerlIO_printf(Perl_debug_log, 
+				  "sbrk(%ld) for %ld-byte-long arena\n",
+				  (long)require, (long) needed));
+	    cp = (char *)sbrk(require);
+#ifdef DEBUGGING_MSTATS
+	    sbrks++;
+#endif 
+	    if (cp == last_sbrk_top) {
+		/* Common case, anything is fine. */
+		sbrk_good++;
+		ovp = (union overhead *) (cp - sbrked_remains);
+		sbrked_remains = require - (needed - sbrked_remains);
+	    } else if (cp == (char *)-1) { /* no more room! */
+		ovp = (union overhead *)emergency_sbrk(needed);
+		if (ovp == (union overhead *)-1)
+		    return;
+		goto gotit;
+	    } else {			/* Non-continuous or first sbrk(). */
+		long add = sbrked_remains;
+		char *newcp;
+
+		if (sbrked_remains) {	/* Put rest into chain, we
+					   cannot use it right now. */
+		    add_to_chain((void*)(last_sbrk_top - sbrked_remains),
+				 sbrked_remains, 0);
+		}
+
+		/* Second, check alignment. */
+		slack = 0;
+
+#ifndef atarist /* on the atari we dont have to worry about this */
+#  ifndef I286 	/* The sbrk(0) call on the I286 always returns the next segment */
+
+		/* CHUNK_SHIFT is 1 for PACK_MALLOC, 0 otherwise. */
+		if ((UV)cp & (0x7FF >> CHUNK_SHIFT)) { /* Not aligned. */
+		    slack = (0x800 >> CHUNK_SHIFT)
+			- ((UV)cp & (0x7FF >> CHUNK_SHIFT));
+		    add += slack;
+		}
 #  endif
 #endif /* atarist */
-
-#if !(defined(I286) || defined(atarist))
-	/* take 2k unless the block is bigger than that */
-  	rnu = (bucket <= 8) ? 11 : bucket + 3;
-#else
-	/* take 16k unless the block is bigger than that 
-	   (80286s like large segments!), probably good on the atari too */
-  	rnu = (bucket <= 11) ? 14 : bucket + 3;
+		
+		if (add) {
+		    DEBUG_m(PerlIO_printf(Perl_debug_log, 
+"sbrk(%ld) to fix non-continuous/off-page sbrk:\n\t%ld for alignement,\t%ld were assumed to come from the tail of the previous sbrk\n",
+					  (long)add, (long) slack,
+					  (long) sbrked_remains));
+		    newcp = (char *)sbrk(add);
+#if defined(DEBUGGING_MSTATS)
+		    sbrks++;
+		    sbrk_slack += add;
+#endif
+		    if (newcp != cp + require) {
+			/* Too bad: even rounding sbrk() is not continuous.*/
+			DEBUG_m(PerlIO_printf(Perl_debug_log, 
+					      "failed to fix bad sbrk()\n"));
+#ifdef PACK_MALLOC
+			if (slack)
+			    croak("panic: Off-page sbrk");
 #endif
-  	nblks = 1 << (rnu - (bucket + 3));  /* how many blocks to get */
-	needed = (MEM_SIZE)1 << rnu;
-#ifdef TWO_POT_OPTIMIZE
-	needed += (bucket >= (FIRST_BIG_TWO_POT - 3) ? PERL_PAGESIZE : 0);
+			if (sbrked_remains) {
+			    /* Try again. */
+#if defined(DEBUGGING_MSTATS)
+			    sbrk_slack += require;
+#endif
+			    require = needed;
+			    DEBUG_m(PerlIO_printf(Perl_debug_log, 
+						  "straight sbrk(%ld)\n",
+						  (long)require));
+			    cp = (char *)sbrk(require);
+#ifdef DEBUGGING_MSTATS
+			    sbrks++;
 #endif 
-	ovp = (union overhead *)sbrk(needed);
-	/* no more room! */
-  	if (ovp == (union overhead *)-1) {
-	    ovp = (union overhead *)emergency_sbrk(needed);
-	    if (ovp == (union overhead *)-1)
-  		return;
-	}
+			    if (cp == (char *)-1)
+				return;
+			}
+			sbrk_good = -1;	/* Disable optimization!
+					   Continue with not-aligned... */
+		    } else {
+			cp += slack;
+			require += sbrked_remains;
+		    }
+		}
+
+		if (last_sbrk_top) {
+		    sbrk_good -= SBRK_FAILURE_PRICE;
+		}
+
+		ovp = (union overhead *) cp;
+		/*
+		 * Round up to minimum allocation size boundary
+		 * and deduct from block count to reflect.
+		 */
+
+#ifndef I286	/* Again, this should always be ok on an 80286 */
+		if ((UV)ovp & 7) {
+		    ovp = (union overhead *)(((UV)ovp + 8) & ~7);
+		    DEBUG_m(PerlIO_printf(Perl_debug_log, 
+					  "fixing sbrk(): %d bytes off machine alignement\n",
+					  (int)((UV)ovp & 7)));
+		    nblks--;
+# if defined(DEBUGGING_MSTATS)
+		    /* This is only approx. if TWO_POT_OPTIMIZE: */
+		    sbrk_slack += (1 << bucket);
+# endif
+		}
+#endif
+		sbrked_remains = require - needed;
+	    }
+	    last_sbrk_top = cp + require;
+	    last_op = (char*) cp;
 #ifdef DEBUGGING_MSTATS
-	goodsbrk += needed;
+	    goodsbrk += require;
 #endif	
-	/*
-	 * Round up to minimum allocation size boundary
-	 * and deduct from block count to reflect.
-	 */
-#ifndef I286
-#  ifdef PACK_MALLOC
-	if ((UV)ovp & 0x7FF)
-		croak("panic: Off-page sbrk");
-#  endif
-  	if ((UV)ovp & 7) {
-  		ovp = (union overhead *)(((UV)ovp + 8) & ~7);
-  		nblks--;
-  	}
-#else
-	/* Again, this should always be ok on an 80286 */
-#endif
+	}
+
+  gotit:
 	/*
 	 * Add new memory allocated to that on
 	 * free list for this hash bucket.
 	 */
-  	siz = 1 << (bucket + 3);
+  	siz = BUCKET_SIZE(bucket);
 #ifdef PACK_MALLOC
 	*(u_char*)ovp = bucket;	/* Fill index. */
-	if (bucket <= MAX_PACKED - 3) {
-	    ovp = (union overhead *) ((char*)ovp + blk_shift[bucket]);
-	    nblks = n_blks[bucket];
+	if (bucket <= MAX_PACKED) {
+	    ovp = (union overhead *) ((char*)ovp + BLK_SHIFT(bucket));
+	    nblks = N_BLKS(bucket);
 #  ifdef DEBUGGING_MSTATS
-	    start_slack += blk_shift[bucket];
+	    start_slack += BLK_SHIFT(bucket);
 #  endif
-	} else if (bucket <= 11 - 1 - 3) {
-	    ovp = (union overhead *) ((char*)ovp + blk_shift[bucket]);
-	    /* nblks = n_blks[bucket]; */
+	} else if (bucket < LOG_OF_MIN_ARENA * BUCKETS_PER_POW2) {
+	    ovp = (union overhead *) ((char*)ovp + BLK_SHIFT(bucket));
 	    siz -= sizeof(union overhead);
 	} else ovp++;		/* One chunk per block. */
-#endif /* !PACK_MALLOC */
+#endif /* PACK_MALLOC */
   	nextf[bucket] = ovp;
 #ifdef DEBUGGING_MSTATS
 	nmalloc[bucket] += nblks;
+	if (bucket > MAX_PACKED) {
+	    start_slack += M_OVERHEAD * nblks;
+	}
 #endif 
   	while (--nblks > 0) {
 		ovp->ov_next = (union overhead *)((caddr_t)ovp + siz);
@@ -475,11 +941,12 @@ morecore(register int bucket)
 	/* Not all sbrks return zeroed memory.*/
 	ovp->ov_next = (union overhead *)NULL;
 #ifdef PACK_MALLOC
-	if (bucket == 7 - 3) {	/* Special case, explanation is above. */
-	    union overhead *n_op = nextf[7 - 3]->ov_next;
-	    nextf[7 - 3] = (union overhead *)((caddr_t)nextf[7 - 3] 
-					      - sizeof(union overhead));
-	    nextf[7 - 3]->ov_next = n_op;
+	if (bucket == 7*BUCKETS_PER_POW2) { /* Special case, explanation is above. */
+	    union overhead *n_op = nextf[7*BUCKETS_PER_POW2]->ov_next;
+	    nextf[7*BUCKETS_PER_POW2] = 
+		(union overhead *)((caddr_t)nextf[7*BUCKETS_PER_POW2] 
+				   - sizeof(union overhead));
+	    nextf[7*BUCKETS_PER_POW2]->ov_next = n_op;
 	}
 #endif /* !PACK_MALLOC */
 }
@@ -494,18 +961,24 @@ free(void *mp)
 	u_char bucket;
 #endif 
 
-#ifdef PERL_CORE
-    DEBUG_m(PerlIO_printf(Perl_debug_log, "0x%lx: (%05lu) free\n",(unsigned long)cp,(unsigned long)(an++)));
-#endif /* PERL_CORE */
+	DEBUG_m(PerlIO_printf(Perl_debug_log, 
+			      "0x%lx: (%05lu) free\n",
+			      (unsigned long)cp, (unsigned long)(an++)));
 
 	if (cp == NULL)
 		return;
 	ovp = (union overhead *)((caddr_t)cp 
-				 - sizeof (union overhead) * CHUNK_SHIFT);
+				- sizeof (union overhead) * CHUNK_SHIFT);
 #ifdef PACK_MALLOC
 	bucket = OV_INDEX(ovp);
 #endif 
-	if (OV_MAGIC(ovp, bucket) != MAGIC) {
+#ifdef IGNORE_SMALL_BAD_FREE
+	if ((bucket >= FIRST_BUCKET_WITH_CHECK) 
+	    && (OV_MAGIC(ovp, bucket) != MAGIC))
+#else
+	if (OV_MAGIC(ovp, bucket) != MAGIC)
+#endif 
+	    {
 		static int bad_free_warn = -1;
 		if (bad_free_warn == -1) {
 		    char *pbf = PerlEnv_getenv("PERL_BADFREE");
@@ -520,12 +993,24 @@ free(void *mp)
 		warn("Bad free() ignored");
 #endif
 		return;				/* sanity */
-	}
+	    }
 	MUTEX_LOCK(&malloc_mutex);
 #ifdef RCHECK
   	ASSERT(ovp->ov_rmagic == RMAGIC);
-	if (OV_INDEX(ovp) <= MAX_SHORT_BUCKET)
-		ASSERT(*(u_int *)((caddr_t)ovp + ovp->ov_size + 1 - RSLOP) == RMAGIC);
+	if (OV_INDEX(ovp) <= MAX_SHORT_BUCKET) {
+	    int i;
+	    MEM_SIZE nbytes = ovp->ov_size + 1;
+
+	    if ((i = nbytes & 3)) {
+		i = 4 - i;
+		while (i--) {
+		    ASSERT(*((char *)((caddr_t)ovp + nbytes - RSLOP + i))
+			   == RMAGIC_C);
+		}
+	    }
+	    nbytes = (nbytes + 3) &~ 3; 
+	    ASSERT(*(u_int *)((caddr_t)ovp + nbytes - RSLOP) == RMAGIC);	    
+	}
 	ovp->ov_rmagic = RMAGIC - 1;
 #endif
   	ASSERT(OV_INDEX(ovp) < NBUCKETS);
@@ -553,36 +1038,33 @@ realloc(void *mp, size_t nbytes)
 {   
   	register MEM_SIZE onb;
 	union overhead *ovp;
-  	char *res;
-	register int i;
-	int was_alloced = 0;
+  	char *res, prev_bucket;
+	register int bucket;
+	int was_alloced = 0, incr;
 	char *cp = (char*)mp;
 
-#ifdef DEBUGGING
+#if defined(DEBUGGING) || !defined(PERL_CORE)
 	MEM_SIZE size = nbytes;
-#endif
 
-#ifdef PERL_CORE
-#ifdef HAS_64K_LIMIT
-	if (nbytes > 0xffff) {
-		PerlIO_printf(PerlIO_stderr(),
-			      "Reallocation too large: %lx\n", size);
-		my_exit(1);
-	}
-#endif /* HAS_64K_LIMIT */
-	if (!cp)
-		return malloc(nbytes);
-#ifdef DEBUGGING
 	if ((long)nbytes < 0)
 		croak("panic: realloc");
 #endif
-#endif /* PERL_CORE */
+
+	BARK_64K_LIMIT("Reallocation",nbytes,size);
+	if (!cp)
+		return malloc(nbytes);
 
 	MUTEX_LOCK(&malloc_mutex);
 	ovp = (union overhead *)((caddr_t)cp 
-				 - sizeof (union overhead) * CHUNK_SHIFT);
-	i = OV_INDEX(ovp);
-	if (OV_MAGIC(ovp, i) == MAGIC) {
+				- sizeof (union overhead) * CHUNK_SHIFT);
+	bucket = OV_INDEX(ovp);
+#ifdef IGNORE_SMALL_BAD_FREE
+	if ((bucket < FIRST_BUCKET_WITH_CHECK) 
+	    || (OV_MAGIC(ovp, bucket) == MAGIC))
+#else
+	if (OV_MAGIC(ovp, bucket) == MAGIC) 
+#endif 
+	{
 		was_alloced = 1;
 	} else {
 		/*
@@ -596,40 +1078,57 @@ realloc(void *mp, size_t nbytes)
 		 * the memory block being realloc'd is the
 		 * smallest possible.
 		 */
-		if ((i = findbucket(ovp, 1)) < 0 &&
-		    (i = findbucket(ovp, reall_srchlen)) < 0)
-			i = 0;
+		if ((bucket = findbucket(ovp, 1)) < 0 &&
+		    (bucket = findbucket(ovp, reall_srchlen)) < 0)
+			bucket = 0;
 	}
-	onb = (1L << (i + 3)) - 
-#ifdef PACK_MALLOC
-	    (i <= (MAX_PACKED - 3) ? 0 : M_OVERHEAD)
-#else
-	    M_OVERHEAD
-#endif
-#ifdef TWO_POT_OPTIMIZE
-	    + (i >= (FIRST_BIG_TWO_POT - 3) ? PERL_PAGESIZE : 0)
-#endif
-	    ;
+	onb = BUCKET_SIZE_REAL(bucket);
 	/* 
 	 *  avoid the copy if same size block.
-	 *  We are not agressive with boundary cases. Note that it is
-	 *  possible for small number of cases give false negative if
+	 *  We are not agressive with boundary cases. Note that it might
+	 *  (for a small number of cases) give false negative if
 	 *  both new size and old one are in the bucket for
-	 *  FIRST_BIG_TWO_POT, but the new one is near the lower end.
+	 *  FIRST_BIG_POW2, but the new one is near the lower end.
+	 *
+	 *  We do not try to go to 1.5 times smaller bucket so far.
 	 */
-	if (was_alloced &&
-	    nbytes <= onb && (nbytes > ( (onb >> 1) - M_OVERHEAD )
-#ifdef TWO_POT_OPTIMIZE
-			      || (i == (FIRST_BIG_TWO_POT - 3) 
-				  && nbytes >= LAST_SMALL_BOUND )
-#endif	
-		)) {
+	if (nbytes > onb) incr = 1;
+	else {
+#ifdef DO_NOT_TRY_HARDER_WHEN_SHRINKING
+	    if ( /* This is a little bit pessimal if PACK_MALLOC: */
+		nbytes > ( (onb >> 1) - M_OVERHEAD )
+#  ifdef TWO_POT_OPTIMIZE
+		|| (bucket == FIRST_BIG_POW2 && nbytes >= LAST_SMALL_BOUND )
+#  endif	
+		)
+#else  /* !DO_NOT_TRY_HARDER_WHEN_SHRINKING */
+		prev_bucket = ( (bucket > MAX_PACKED + 1) 
+				? bucket - BUCKETS_PER_POW2
+				: bucket - 1);
+	     if (nbytes > BUCKET_SIZE_REAL(prev_bucket))
+#endif /* !DO_NOT_TRY_HARDER_WHEN_SHRINKING */
+		 incr = 0;
+	     else incr = -1;
+	}
+	if (!was_alloced) goto hard_way;
+	else if (incr == 0) {
+	  inplace:
 #ifdef RCHECK
 		/*
 		 * Record new allocated size of block and
 		 * bound space with magic numbers.
 		 */
 		if (OV_INDEX(ovp) <= MAX_SHORT_BUCKET) {
+		       int i, nb = ovp->ov_size + 1;
+
+		       if ((i = nb & 3)) {
+			   i = 4 - i;
+			   while (i--) {
+			       ASSERT(*((char *)((caddr_t)ovp + nb - RSLOP + i)) == RMAGIC_C);
+			   }
+		       }
+		       nb = (nb + 3) &~ 3; 
+		       ASSERT(*(u_int *)((caddr_t)ovp + nb - RSLOP) == RMAGIC);
 			/*
 			 * Convert amount of memory requested into
 			 * closest block size stored in hash buckets
@@ -637,33 +1136,87 @@ realloc(void *mp, size_t nbytes)
 			 * space used per block for accounting.
 			 */
 			nbytes += M_OVERHEAD;
-			nbytes = (nbytes + 3) &~ 3; 
 			ovp->ov_size = nbytes - 1;
+			if ((i = nbytes & 3)) {
+			    i = 4 - i;
+			    while (i--)
+				*((char *)((caddr_t)ovp + nbytes - RSLOP + i))
+				    = RMAGIC_C;
+			}
+			nbytes = (nbytes + 3) &~ 3; 
 			*((u_int *)((caddr_t)ovp + nbytes - RSLOP)) = RMAGIC;
 		}
 #endif
 		res = cp;
 		MUTEX_UNLOCK(&malloc_mutex);
-	}
-	else {
-		MUTEX_UNLOCK(&malloc_mutex);
-		if ((res = (char*)malloc(nbytes)) == NULL)
-			return (NULL);
-		if (cp != res)			/* common optimization */
-			Copy(cp, res, (MEM_SIZE)(nbytes<onb?nbytes:onb), char);
-		if (was_alloced)
-			free(cp);
+	} else if (incr == 1 && (cp - M_OVERHEAD == last_op) 
+		   && (onb > (1 << LOG_OF_MIN_ARENA))) {
+	    MEM_SIZE require, newarena = nbytes, pow;
+	    int shiftr;
+
+	    POW2_OPTIMIZE_ADJUST(newarena);
+	    newarena = newarena + M_OVERHEAD;
+	    /* newarena = (newarena + 3) &~ 3; */
+	    shiftr = (newarena - 1) >> LOG_OF_MIN_ARENA;
+	    pow = LOG_OF_MIN_ARENA + 1;
+	    /* apart from this loop, this is O(1) */
+	    while (shiftr >>= 1)
+  		pow++;
+	    newarena = (1 << pow) + POW2_OPTIMIZE_SURPLUS(pow * BUCKETS_PER_POW2);
+	    require = newarena - onb - M_OVERHEAD;
+	    
+	    if (require <= sbrked_remains) {
+		sbrked_remains -= require;
+	    } else {
+		char *cp;
+
+		require -= sbrked_remains;
+		/* We do not try to optimize sbrks here, we go for place. */
+		cp = (char*) sbrk(require);
+#ifdef DEBUGGING_MSTATS
+		sbrks++;
+		goodsbrk += require;
+#endif 
+		if (cp == last_sbrk_top) {
+		    sbrked_remains = 0;
+		    last_sbrk_top = cp + require;
+		} else {
+		    /* Report the failure: */
+		    if (sbrked_remains)
+			add_to_chain((void*)(last_sbrk_top - sbrked_remains),
+				     sbrked_remains, 0);
+		    add_to_chain((void*)cp, require, 0);
+		    sbrk_good -= SBRK_FAILURE_PRICE;
+		    sbrked_remains = 0;
+		    last_sbrk_top = 0;
+		    last_op = 0;
+		    goto hard_way;
+		}
+	    }
+	    
+#ifdef DEBUGGING_MSTATS
+	    nmalloc[bucket]--;
+	    nmalloc[pow * BUCKETS_PER_POW2]++;
+#endif 	    
+	    *(cp - M_OVERHEAD) = pow * BUCKETS_PER_POW2; /* Fill index. */
+	    goto inplace;
+	} else {
+	  hard_way:
+	    MUTEX_UNLOCK(&malloc_mutex);
+	    if ((res = (char*)malloc(nbytes)) == NULL)
+		return (NULL);
+	    if (cp != res)			/* common optimization */
+		Copy(cp, res, (MEM_SIZE)(nbytes<onb?nbytes:onb), char);
+	    if (was_alloced)
+		free(cp);
 	}
 
-#ifdef PERL_CORE
-#ifdef DEBUGGING
-    if (debug & 128) {
-	PerlIO_printf(Perl_debug_log, "0x%lx: (%05lu) rfree\n",(unsigned long)res,(unsigned long)(an++));
-	PerlIO_printf(Perl_debug_log, "0x%lx: (%05lu) realloc %ld bytes\n",
-	    (unsigned long)res,(unsigned long)(an++),(long)size);
-    }
-#endif
-#endif /* PERL_CORE */
+	DEBUG_m(PerlIO_printf(Perl_debug_log, "0x%lu: (%05lu) rfree\n",
+			      (unsigned long)res,(unsigned long)(an++)));
+	DEBUG_m(PerlIO_printf(Perl_debug_log, 
+			      "0x%lx: (%05lu) realloc %ld bytes\n",
+			      (unsigned long)res,(unsigned long)(an++),
+			      (long)size));
   	return ((Malloc_t)res);
 }
 
@@ -701,7 +1254,21 @@ calloc(register size_t elements, registe
     return p;
 }
 
+MEM_SIZE
+malloced_size(void *p)
+{
+    int bucket = OV_INDEX((union overhead *)p);
+
+    return BUCKET_SIZE_REAL(bucket);
+}
+
 #ifdef DEBUGGING_MSTATS
+
+#  ifdef BUCKETS_ROOT2
+#    define MIN_EVEN_REPORT 6
+#  else
+#    define MIN_EVEN_REPORT MIN_BUCKET
+#  endif 
 /*
  * mstats - print out statistics about malloc
  * 
@@ -714,31 +1281,72 @@ dump_mstats(char *s)
 {
   	register int i, j;
   	register union overhead *p;
-  	int topbucket=0, totfree=0, total=0;
+  	int topbucket=0, topbucket_ev=0, topbucket_odd=0, totfree=0, total=0;
 	u_int nfree[NBUCKETS];
+	int total_chain = 0;
+	struct chunk_chain_s* nextchain = chunk_chain;
 
-  	for (i=0; i < NBUCKETS; i++) {
+  	for (i = MIN_BUCKET ; i < NBUCKETS; i++) {
   		for (j = 0, p = nextf[i]; p; p = p->ov_next, j++)
   			;
 		nfree[i] = j;
-  		totfree += nfree[i]   * (1 << (i + 3));
-  		total += nmalloc[i] * (1 << (i + 3));
-		if (nmalloc[i])
-			topbucket = i;
+  		totfree += nfree[i] * BUCKET_SIZE_REAL(i);
+  		total += nmalloc[i] * BUCKET_SIZE_REAL(i);
+		if (nmalloc[i]) {
+		    i % 2 ? (topbucket_odd = i) : (topbucket_ev = i);
+		    topbucket = i;
+		}
   	}
   	if (s)
-		PerlIO_printf(PerlIO_stderr(), "Memory allocation statistics %s (buckets 8..%d)\n",
-			s, (1 << (topbucket + 3)) );
+	    PerlIO_printf(PerlIO_stderr(),
+			  "Memory allocation statistics %s (buckets %d(%d)..%d(%d)\n",
+			  s, 
+			  BUCKET_SIZE_REAL(MIN_BUCKET), 
+			  BUCKET_SIZE(MIN_BUCKET),
+			  BUCKET_SIZE_REAL(topbucket), BUCKET_SIZE(topbucket));
   	PerlIO_printf(PerlIO_stderr(), "%8d free:", totfree);
-  	for (i=0; i <= topbucket; i++) {
-  		PerlIO_printf(PerlIO_stderr(), (i<5 || i==7)?" %5d": (i<9)?" %3d":" %d", nfree[i]);
+  	for (i = MIN_EVEN_REPORT; i <= topbucket; i += BUCKETS_PER_POW2) {
+  		PerlIO_printf(PerlIO_stderr(), 
+			      ((i < 8*BUCKETS_PER_POW2 || i == 10*BUCKETS_PER_POW2)
+			       ? " %5d" 
+			       : ((i < 12*BUCKETS_PER_POW2) ? " %3d" : " %d")),
+			      nfree[i]);
+  	}
+#ifdef BUCKETS_ROOT2
+	PerlIO_printf(PerlIO_stderr(), "\n\t   ");
+  	for (i = MIN_BUCKET + 1; i <= topbucket_odd; i += BUCKETS_PER_POW2) {
+  		PerlIO_printf(PerlIO_stderr(), 
+			      ((i < 8*BUCKETS_PER_POW2 || i == 10*BUCKETS_PER_POW2)
+			       ? " %5d" 
+			       : ((i < 12*BUCKETS_PER_POW2) ? " %3d" : " %d")),
+			      nfree[i]);
   	}
+#endif 
   	PerlIO_printf(PerlIO_stderr(), "\n%8d used:", total - totfree);
-  	for (i=0; i <= topbucket; i++) {
-  		PerlIO_printf(PerlIO_stderr(), (i<5 || i==7)?" %5d": (i<9)?" %3d":" %d", nmalloc[i] - nfree[i]);
+  	for (i = MIN_EVEN_REPORT; i <= topbucket; i += BUCKETS_PER_POW2) {
+  		PerlIO_printf(PerlIO_stderr(), 
+			      ((i < 8*BUCKETS_PER_POW2 || i == 10*BUCKETS_PER_POW2)
+			       ? " %5d" 
+			       : ((i < 12*BUCKETS_PER_POW2) ? " %3d" : " %d")), 
+			      nmalloc[i] - nfree[i]);
   	}
-	PerlIO_printf(PerlIO_stderr(), "\nTotal sbrk(): %8d. Odd ends: sbrk(): %7d, malloc(): %7d bytes.\n",
-		      goodsbrk + sbrk_slack, sbrk_slack, start_slack);
+#ifdef BUCKETS_ROOT2
+	PerlIO_printf(PerlIO_stderr(), "\n\t   ");
+  	for (i = MIN_BUCKET + 1; i <= topbucket_odd; i += BUCKETS_PER_POW2) {
+  		PerlIO_printf(PerlIO_stderr(), 
+			      ((i < 8*BUCKETS_PER_POW2 || i == 10*BUCKETS_PER_POW2)
+			       ? " %5d" 
+			       : ((i < 12*BUCKETS_PER_POW2) ? " %3d" : " %d")),
+			      nmalloc[i] - nfree[i]);
+  	}
+#endif 
+	while (nextchain) {
+	    total_chain += nextchain->size;
+	    nextchain = nextchain->next;
+	}
+	PerlIO_printf(PerlIO_stderr(), "\nTotal sbrk(): %d/%d:%d. Odd ends: pad+heads+chain+tail: %d+%d+%d+%d.\n",
+		      goodsbrk + sbrk_slack, sbrks, sbrk_good, sbrk_slack,
+		      start_slack, total_chain, sbrked_remains);
 }
 #else
 void
@@ -799,9 +1407,6 @@ int size;
       if (size >= PERLSBRK_32_K) {
 	small = 0;
       } else {
-#ifndef PERL_CORE
-	reqsize = size;
-#endif
 	size = PERLSBRK_64_K;
 	small = 1;
       }
@@ -816,10 +1421,8 @@ int size;
       }
     }
 
-#ifdef PERL_CORE
     DEBUG_m(PerlIO_printf(Perl_debug_log, "sbrk malloc size %ld (reqsize %ld), left size %ld, give addr 0x%lx\n",
 		    size, reqsize, Perl_sbrk_oldsize, got));
-#endif
 
     return (void *)got;
 }