1#ifndef _GNU_SOURCE
   2#   define _GNU_SOURCE
   3#endif
   4
   5#include <errno.h>
   6#include <stdarg.h>
   7#include <stdbit.h>
   8#include <stdio.h>
   9#include <stdint.h>
  10#include <stdlib.h>
  11#include <string.h>
  12#include <threads.h>
  13#include <sys/mman.h>
  14
  15#include "alignment.h"
  16#include "allocator.h"
  17#include "dump.h"
  18#include "src/word.h"
  19
  20// unit size should not be less than size of pointer
  21#define UNIT_SIZE  16
  22
  23// threads serialization
  24static mtx_t lock;
  25
  26/****************************************************************
  27 * Trace/debug output
  28 */
  29
  30static void print_msg(const char* func_name, char* fmt, ...)
  31{
  32    fprintf(stderr, "Bitmap allocator -- %s: ", func_name);
  33    va_list ap;
  34    va_start(ap);
  35    vfprintf(stderr, fmt, ap);
  36    va_end(ap);
  37}
  38
  39#define ERR(...)  print_msg(__func__, __VA_ARGS__)
  40
  41#define SAY(...)  do { if (pet_allocator.verbose) { print_msg(__func__, __VA_ARGS__); } } while(false)
  42
  43#ifdef DEBUG
  44#   define TRACE(...)  do { if (pet_allocator.trace) { print_msg(__func__, __VA_ARGS__); } } while(false)
  45#else
  46#   define TRACE(...)
  47#endif
  48
  49
  50/****************************************************************
  51 * Stats
  52 */
  53
  54static AllocatorStats stats = {};
  55
  56static atomic_size_t num_bm_pages = 0;
  57
  58/****************************************************************
  59 * memory cleaning
  60 */
  61
  62static void cleanse(void* addr, unsigned start, unsigned end)
  63{
  64    TRACE("addr=%p, start=%u, end=%u\n", addr, start, end);
  65
  66    unsigned length = end - start;
  67
  68    // clean bytes till start of word
  69    uint8_t* byteptr = addr;
  70    byteptr += start;
  71    unsigned nbytes = start & (sizeof(Word) - 1);
  72    if (nbytes) {
  73        nbytes = sizeof(Word) - nbytes;
  74        if (nbytes > length) {
  75            nbytes = length;
  76        }
  77        TRACE("leading %u bytes\n", nbytes);
  78        for (unsigned i = 0; i < nbytes; i++) {
  79            *byteptr++ = 0;
  80            length--;
  81        }
  82    }
  83
  84    // clean words
  85    TRACE("%u words\n", length / sizeof(Word));
  86    Word* wordptr = (Word*) byteptr;
  87    while (length >= sizeof(Word)) {
  88        *wordptr++ = 0;
  89        length -= sizeof(Word);
  90    }
  91
  92    if (length) {
  93        // clean remaining bytes
  94        TRACE("remaining %u bytes\n", length);
  95        byteptr = (uint8_t*) wordptr;
  96        while (length--) {
  97            *byteptr++ = 0;
  98        }
  99    }
 100}
 101
 102/****************************************************************
 103 * mmap/mremap/munmap wrappers
 104 */
 105
 106static void* call_mmap(unsigned size, bool clean)
 107/*
 108 * call mmap to allocate pages
 109 *
 110 * size must be multiple of sys_page_size
 111 *
 112 * bear in mind mmap called immediately after munmap may return dirty page,
 113 * so explicit cleaning is a must
 114 */
 115{
 116    void* result = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 117    if (result == MAP_FAILED) {
 118        ERR("mmap: %s\n", strerror(errno));
 119        return nullptr;
 120    }
 121    if (clean) {
 122        cleanse(result, 0, size);
 123    }
 124    return result;
 125}
 126
 127static inline void call_munmap(void* addr, unsigned size)
 128{
 129    if (munmap(addr, size) == -1) {
 130        ERR("munmap(%p, %u): %s\n", addr, size, strerror(errno));
 131    }
 132}
 133
 134static void* call_mremap(void* addr, unsigned old_nbytes, unsigned new_nbytes, bool clean)
 135/*
 136 * old/new_nbytes are unaligned
 137 */
 138{
 139    unsigned old_size = align_unsigned_to_page(old_nbytes);
 140    unsigned new_size = align_unsigned_to_page(new_nbytes);
 141    if (new_size == old_size) {
 142        if (clean && new_nbytes > old_nbytes) {
 143            cleanse(addr, old_nbytes, new_nbytes);
 144        }
 145        return addr;
 146    }
 147    int flags;
 148    if (new_size > old_size) {
 149        flags = MREMAP_MAYMOVE;
 150    } else {
 151        flags = 0;
 152        clean = false;  // don't clean when shrinking
 153    }
 154    void* new_addr = mremap(addr, old_size, new_size, flags);
 155    if (new_addr == MAP_FAILED) {
 156        ERR("mremap(%p, %u, %u): %s\n", addr, old_size, new_size, strerror(errno));
 157        if (new_size > old_size) {
 158            // grow failed
 159            return nullptr;
 160        } else {
 161            // shrink failed, return same address
 162            return addr;
 163        }
 164    }
 165    if (clean) {
 166        cleanse(new_addr, old_nbytes, new_nbytes);
 167    }
 168    return new_addr;
 169}
 170
 171/****************************************************************
 172 * bitmap allocator parameters
 173 */
 174
 175static unsigned units_per_page;  // sys_page_size / UNIT_SIZE
 176
 177static unsigned bm_page_header_size_in_units; // num_reserved_units ???
 178/*
 179 * Calculated as: (offsetof(BmPageHeader, bitmap)
 180 *                 + units_per_page / 8  // size of bitmap in bytes
 181 *                 + UNIT_SIZE - 1       // rounding
 182 *                ) / UNIT_SIZE
 183 */
 184
 185static unsigned max_data_units;  // sys_page_size - bm_page_header_size_in_units
 186
 187
 188static inline unsigned bytes_to_units(unsigned nbytes)
 189{
 190    return align_unsigned(nbytes, UNIT_SIZE) / UNIT_SIZE;
 191}
 192
 193/****************************************************************
 194 * Bitmap allocator data page and superblock
 195 */
 196
 197typedef struct _BmPageHeader {
 198    /*
 199     * On 4K page the header takes four 16-byte units, leaving 4032 bytes for data.
 200     */
 201    struct _BmPageHeader** volatile list;
 202    struct _BmPageHeader* next;
 203    struct _BmPageHeader* prev;
 204
 205    // variable part
 206
 207    // the size of bitmap depends on page size, for 4K it takes 32 bytes
 208    Word bitmap[ /* sys_page_size() / UNIT_SIZE / WORD_WIDTH */ ];
 209
 210} BmPageHeader;
 211
 212
 213static BmPageHeader* volatile lru_page = nullptr;
 214/*
 215 * last recently used page.
 216 *
 217 * Technically, this is a list, and functions that grab a page
 218 * do not make a difference between this variable and superblock entry.
 219 * However, LRU may contain single item only.
 220 */
 221
 222static BmPageHeader** volatile superblock;
 223/*
 224 * Straightforward definition would be:
 225 *
 226 * BmPageHeader* superblock[ units_per_page ];
 227 *
 228 * The array contains pointers to bm_page lists grouped by their
 229 * longest free block.
 230 *
 231 * Superblock cannot take more than one page.
 232 * Usually it takes a half, if UNIT_SIZE is twice longer than size of pointer.
 233 */
 234
 235static void dump_bm_page(BmPageHeader* bm_page)
 236{
 237    fprintf(stderr, "Page %p: list=%p, next=%p, prev=%p\n",
 238            (void*) bm_page, (void*) bm_page->list, (void*) bm_page->next, (void*) bm_page->prev);
 239    dump_bitmap(stderr, (uint8_t*)(bm_page->bitmap), units_per_page / 8);
 240}
 241
 242static void dump()
 243{
 244    BmPageHeader** list = superblock;
 245    fprintf(stderr, "\nAllocator bm pages: %zu, blocks allocated %zu\n",
 246            num_bm_pages, stats.blocks_allocated);
 247    if (lru_page) {
 248        fprintf(stderr, "LRU page: %p\n", (void*) lru_page);
 249        dump_bm_page(lru_page);
 250    }
 251    for (unsigned i = 0; i < units_per_page; i++, list++) {
 252        BmPageHeader* first_page = *list;
 253        if (first_page) {
 254            fprintf(stderr, "Superblock entry %u: %p -> %p\n", i, (void*) list, (void*) first_page);
 255            BmPageHeader* bm_page = first_page;
 256            do {
 257                dump_bm_page(bm_page);
 258                bm_page = bm_page->next;
 259            } while (bm_page != first_page);
 260        }
 261    }
 262    fputc('\n', stderr);
 263}
 264
 265/****************************************************************
 266 * Basic bitmap functions
 267 */
 268
 269static unsigned count_zero_bits(BmPageHeader* bm_page, unsigned offset, unsigned limit)
 270/*
 271 * Count consecutive zero bits in the bitmap starting from `offset` bit
 272 * up to `limit`. The limit is treated as a hint when to stop, returned count can be greater.
 273 */
 274{
 275    unsigned count = 0;
 276    Word* ptr = &bm_page->bitmap[offset / WORD_WIDTH];
 277
 278    // count starting bits up to the the next word boundary
 279    unsigned bit_index = offset & (WORD_WIDTH - 1);
 280    if (bit_index) {
 281        Word w = *ptr++;
 282        w >>= bit_index;
 283        if (w) {
 284            // we have only ending bits
 285            return stdc_trailing_zeros(w);
 286        }
 287        count = WORD_WIDTH - bit_index;
 288        offset += count;
 289    }
 290
 291    // count zero words
 292    while (offset < units_per_page && count < limit) {
 293        Word w = *ptr++;
 294        if (w) {
 295            // count ending bits
 296            count += stdc_trailing_zeros(w);
 297            break;
 298        }
 299        count += WORD_WIDTH;
 300        offset += WORD_WIDTH;
 301    }
 302    return count;
 303}
 304
 305static unsigned count_nonzero_bits(BmPageHeader* bm_page, unsigned offset, unsigned limit)
 306/*
 307 * Count consecutive nonzero bits in the bitmap starting from `offset` bit
 308 * up to `limit`. The limit is treated as a hint when to stop, returned count can be greater.
 309 *
 310 * The code is exactly the same as in count_zero_bits, the only difference is inversion.
 311 */
 312{
 313    unsigned count = 0;
 314    Word* ptr = &bm_page->bitmap[offset / WORD_WIDTH];
 315
 316    // count starting bits up to the the next word boundary
 317    unsigned bit_index = offset & (WORD_WIDTH - 1);
 318    if (bit_index) {
 319        Word w = ~*ptr++;
 320        w >>= bit_index;
 321        if (w) {
 322            // we have only ending bits
 323            return stdc_trailing_zeros(w);
 324        }
 325        count = WORD_WIDTH - bit_index;
 326        offset += count;
 327    }
 328
 329    // count all-one words
 330    while (offset < units_per_page && count < limit) {
 331        Word w = ~*ptr++;
 332        if (w) {
 333            // count ending bits
 334            count += stdc_trailing_zeros(w);
 335            break;
 336        }
 337        count += WORD_WIDTH;
 338        offset += WORD_WIDTH;
 339    }
 340    return count;
 341}
 342
 343static void set_bits(BmPageHeader* bm_page, unsigned offset, unsigned length)
 344/*
 345 * Set bits in the bitmap starting from offset.
 346 */
 347{
 348    TRACE("bm_page=%p offset=%u length=%u\n", (void*) bm_page, offset, length);
 349    Word* ptr = &bm_page->bitmap[offset / WORD_WIDTH];
 350
 351    // set starting bits up to the the next word boundary
 352    unsigned bit_index = offset & (WORD_WIDTH - 1);
 353    if (bit_index) {
 354        Word bitmask = WORD_MAX;
 355        unsigned num_bits = WORD_WIDTH - bit_index;
 356        if (length <= num_bits) {
 357            bitmask &= (((Word) 1) << length) - 1;
 358            num_bits = length;
 359        }
 360        bitmask <<= bit_index;
 361        *ptr++ |= bitmask;
 362        offset += num_bits;
 363        length -= num_bits;
 364    }
 365
 366    // set remaining words
 367    while (length >= WORD_WIDTH) {
 368        *ptr++ = WORD_MAX;
 369        length -= WORD_WIDTH;
 370    }
 371
 372    // set ending bits
 373    if (length) {
 374        *ptr++ |= (((Word) 1) << length) - 1;
 375    }
 376}
 377
 378static void clear_bits(BmPageHeader* bm_page, unsigned offset, unsigned length)
 379/*
 380 * Clear bits in the bitmap starting from offset.
 381 *
 382 * The logic is the same as in set_bits.
 383 */
 384{
 385    TRACE("bm_page=%p offset=%u length=%u\n", (void*) bm_page, offset, length);
 386    Word* ptr = &bm_page->bitmap[offset / WORD_WIDTH];
 387
 388    // clear starting bits up to the the next word boundary
 389    unsigned bit_index = offset & (WORD_WIDTH - 1);
 390    if (bit_index) {
 391        Word bitmask = WORD_MAX;
 392        unsigned num_bits = WORD_WIDTH - bit_index;
 393        if (length <= num_bits) {
 394            bitmask &= (((Word) 1) << length) - 1;
 395            num_bits = length;
 396        }
 397        bitmask <<= bit_index;
 398        *ptr++ &= ~bitmask;
 399        offset += num_bits;
 400        length -= num_bits;
 401    }
 402
 403    // clear remaining words
 404    while (length >= WORD_WIDTH) {
 405        *ptr++ = 0;
 406        length -= WORD_WIDTH;
 407    }
 408
 409    // clear ending bits
 410    if (length) {
 411        *ptr++ &= ~((((Word) 1) << length) - 1);
 412    }
 413}
 414
 415/****************************************************************
 416 * Bitmap allocator functions
 417 */
 418
 419static unsigned find_free_block(BmPageHeader* bm_page, unsigned block_size)
 420/*
 421 * Search for free block.
 422 * Return offset of the first available block or 0 if no block is found.
 423 * Given that first units of bm_page are always in use,
 424 * offset can never be zero on success.
 425 */
 426{
 427    unsigned offset = bm_page_header_size_in_units;
 428    while (offset < units_per_page) {
 429        unsigned length = count_zero_bits(bm_page, offset, block_size);
 430        if (length >= block_size) {
 431            TRACE("bm_page=%p block_size=%u -> offset=%u\n", (void*) bm_page, block_size, offset);
 432            return offset;
 433        }
 434        offset += length;
 435        offset += count_nonzero_bits(bm_page, offset, UINT_MAX);
 436    }
 437    TRACE("bm_page=%p block_size=%u -> 0\n", (void*) bm_page, block_size);
 438    return 0;
 439}
 440
 441static unsigned find_longest_free_block(BmPageHeader* bm_page)
 442/*
 443 * Search for the longest sequence of zero bits and return its length.
 444 */
 445{
 446    unsigned offset = bm_page_header_size_in_units;
 447    unsigned n = max_data_units;
 448    unsigned lfb = 0;
 449    while (n) {
 450        unsigned length = count_zero_bits(bm_page, offset, n);
 451        if (length > lfb) {
 452            lfb = length;
 453        }
 454        offset += length;
 455        n -= length;
 456
 457        length = count_nonzero_bits(bm_page, offset, n);
 458        offset += length;
 459        n -= length;
 460    }
 461    TRACE("bm_page=%p -> lfb=%u\n", (void*) bm_page, lfb);
 462    return lfb;
 463}
 464
 465static void add_to_list(BmPageHeader** list, BmPageHeader* bm_page)
 466/*
 467 * Add bm_page to circular doubly-linked list.
 468 */
 469{
 470    BmPageHeader* first = *list;
 471    if (first) {
 472        // add to the end of list
 473        bm_page->prev = first->prev;
 474        bm_page->next = first;
 475        first->prev->next = bm_page;
 476        first->prev = bm_page;
 477    } else {
 478        // init list
 479        *list = bm_page->next = bm_page->prev = bm_page;
 480    }
 481    bm_page->list = list;
 482}
 483
 484static void delete_from_list(BmPageHeader* bm_page)
 485/*
 486 * Delete bm_page from circular doubly-linked list.
 487 */
 488{
 489    BmPageHeader** list = bm_page->list;
 490
 491#   ifdef DEBUG
 492        if (!list) {
 493            ERR("double call delete_from_list(%p)\n", (void*) bm_page);
 494            abort();
 495        }
 496        if (list == &lru_page) {
 497            TRACE("deleting page %p from LRU\n", (void*) bm_page);
 498        } else {
 499            TRACE("deleting page %p from superblock[%tu]\n", (void*) bm_page, list - superblock);
 500        }
 501#   endif
 502
 503    if (bm_page->next == bm_page->prev) {
 504        // last page, make list empty
 505        *list = nullptr;
 506    } else {
 507        if (*list == bm_page) {
 508            *list = bm_page->next;
 509        }
 510        bm_page->next->prev = bm_page->prev;
 511        bm_page->prev->next = bm_page->next;
 512    }
 513    bm_page->list = nullptr;
 514}
 515
 516static void add_to_superblock_entry(BmPageHeader* bm_page, unsigned lfb)
 517/*
 518 * Add page to superblock entry by `lfb`.
 519 */
 520{
 521    mtx_lock(&lock);
 522    TRACE("adding page %p to superblock[%tu]\n", (void*) bm_page, lfb);
 523    add_to_list(&superblock[lfb], bm_page);
 524    mtx_unlock(&lock);
 525}
 526
 527static inline void unhand_page(BmPageHeader* bm_page)
 528/*
 529 * If `lru_page` is set, move it to superblock.
 530 * Assign `bm_page` to `lru_page`.
 531 */
 532{
 533    BmPageHeader* page_to_reclaim = nullptr;
 534
 535    mtx_lock(&lock);
 536    if (lru_page) {
 537        // have to scan while lock is held
 538        unsigned lfb = find_longest_free_block(lru_page);
 539
 540        if (lfb < max_data_units) {
 541            TRACE("adding lru_page %p to superblock[%tu]\n", (void*) lru_page, lfb);
 542            add_to_list(&superblock[lfb], lru_page);
 543        } else {
 544            // okay to reclaim this page
 545            page_to_reclaim = lru_page;
 546        }
 547    }
 548    // now set lru_page
 549    TRACE("adding bm_page %p to LRU\n", (void*) bm_page);
 550    bm_page->list = (BmPageHeader** /* make compiller happy with volatile */) &lru_page;
 551    lru_page = bm_page->next = bm_page->prev = bm_page;
 552    mtx_unlock(&lock);
 553
 554    if (page_to_reclaim) {
 555        TRACE("releasing page %p\n", (void*) page_to_reclaim);
 556        call_munmap(page_to_reclaim, sys_page_size());
 557        atomic_fetch_sub(&num_bm_pages, 1);
 558    }
 559}
 560
 561static inline BmPageHeader* bm_page_by_addr(void* addr)
 562/*
 563 * Get address of the bm_page from `addr`
 564 */
 565{
 566    return (BmPageHeader*) (
 567        ((ptrdiff_t) addr) & ~((ptrdiff_t) sys_page_size() - 1)
 568    );
 569}
 570
 571static void grab_page(BmPageHeader* bm_page)
 572/*
 573 * Prepare page for reallocating units
 574 * in the context of current thread.
 575 * The page is obtained by address using `bm_page_by_addr`.
 576 *
 577 * Take the page out of whatever list it belongs,
 578 * either superblock or LRU.
 579 */
 580{
 581#   if DEBUG
 582        unsigned msg_interval = 0;
 583#   endif
 584
 585    for (;;) {
 586        mtx_lock(&lock);
 587        if (bm_page->list) {
 588            // the page belongs to some list
 589            break;
 590        }
 591        // page is probably in use by other thread
 592#       if DEBUG
 593            if (msg_interval == 0) {
 594                TRACE("waiting for page %p to be released\n", (void*) bm_page);
 595                msg_interval = 1000;
 596            } else {
 597                msg_interval--;
 598            }
 599#       endif
 600        mtx_unlock(&lock);
 601        thrd_yield();
 602    }
 603#   if DEBUG
 604        if (bm_page->list == &lru_page) {
 605            TRACE("taking page %p out of LRU\n", (void*) bm_page);
 606        } else {
 607            TRACE("taking page %p out of superblock[%tu]\n", (void*) bm_page, bm_page->list - superblock);
 608        }
 609#   endif
 610    delete_from_list(bm_page);
 611    mtx_unlock(&lock);
 612}
 613
 614static inline unsigned ptrdiff_to_units(void* addr, BmPageHeader* bm_page)
 615// helper function for bm_shrink and bm_release invocation
 616{
 617    return (
 618        ((uint8_t*) addr) - ((uint8_t*) bm_page)
 619    ) / UNIT_SIZE;
 620}
 621
 622#ifdef DEBUG
 623    static void check_units_allocated(const char* func, BmPageHeader* bm_page,
 624                                      unsigned offset, unsigned num_units)
 625    {
 626        unsigned n = count_nonzero_bits(bm_page, offset, num_units);
 627        if (n < num_units) {
 628            print_msg(func, "already released some units on bm_page %p starting from %u: in use %u of %u\n",
 629                      (void*) bm_page, offset, n, num_units);
 630        }
 631    }
 632#endif
 633
 634static BmPageHeader* find_available_page(unsigned num_units, unsigned* offset)
 635/*
 636 * Find available page for new allocation.
 637 *
 638 * First, get LRU page. If missing, search superblock lists.
 639 *
 640 * When found, the page is removed either form LRU or from the superblock,
 641 * so only current thread continue working with it while other threads
 642 * can work with their own pages in parallel.
 643 */
 644{
 645    BmPageHeader* bm_page = nullptr;
 646
 647    mtx_lock(&lock);
 648
 649    if (lru_page) {
 650        bm_page = lru_page;
 651        TRACE("taking page %p out of LRU\n", (void*) bm_page);
 652        delete_from_list(bm_page);
 653        mtx_unlock(&lock);
 654
 655        // find free block on the LRU page
 656        *offset = find_free_block(bm_page, num_units);
 657        if (*offset) {
 658            return bm_page;
 659        }
 660        // LRU page has no space available, move it to superblock
 661        add_to_superblock_entry(bm_page, find_longest_free_block(bm_page));
 662
 663        // continue with superblock
 664        mtx_lock(&lock);
 665    }
 666    // XXX optimize for speed with bitmap?
 667    // start searching from num_units position
 668    BmPageHeader** list = &superblock[num_units];
 669    unsigned lfb = num_units;
 670    for (; lfb <= max_data_units; lfb++) {
 671        bm_page = *list++;
 672        if (bm_page) {
 673            TRACE("taking page %p out of superblock[%tu]\n", (void*) bm_page, bm_page->list - superblock);
 674            delete_from_list(bm_page);
 675            goto found;
 676        }
 677    }
 678    mtx_unlock(&lock);
 679    return nullptr;
 680
 681found:
 682    mtx_unlock(&lock);
 683    *offset = find_free_block(bm_page, num_units);
 684    if (*offset == 0) {
 685        ERR("bm_page %p with LFB=%u must contain enough free space for %u units\n",
 686            (void*) bm_page, bm_page->list - superblock, num_units);
 687        abort();
 688    }
 689    return bm_page;
 690}
 691
 692static void* bm_allocate(unsigned num_units, bool clean)
 693/*
 694 * Bitmap sub-allocator, should be called with num_units < max_data_units
 695 */
 696{
 697    TRACE("num_units %u\n", num_units);
 698
 699    void* result = nullptr;
 700    unsigned offset;
 701    BmPageHeader* bm_page = find_available_page(num_units, &offset);
 702    if (bm_page) {
 703        // allocate
 704        set_bits(bm_page, offset, num_units);
 705        unhand_page(bm_page);
 706        result = ((uint8_t*) bm_page) + offset * UNIT_SIZE;
 707        goto out;
 708    }
 709
 710    TRACE("allocating new page\n");
 711
 712    bm_page = call_mmap(sys_page_size(), false);
 713    if (!bm_page) {
 714        goto out;
 715    }
 716    // clean bitmap
 717    Word* ptr = bm_page->bitmap;
 718    for (unsigned i = 0, n = units_per_page / WORD_WIDTH; i < n; i++) {
 719        *ptr++ = 0;
 720    }
 721    // mark reserved units and allocate units
 722    set_bits(bm_page, 0, bm_page_header_size_in_units + num_units);
 723
 724    // give page away to LRU or superblock
 725    unhand_page(bm_page);
 726
 727    atomic_fetch_add(&num_bm_pages, 1);
 728    result = ((uint8_t*) bm_page) + bm_page_header_size_in_units * UNIT_SIZE;
 729
 730out:
 731    if (result) {
 732        atomic_fetch_add(&stats.blocks_allocated, 1);
 733    }
 734
 735    if (result && clean) {
 736        cleanse(result, 0, num_units * UNIT_SIZE);
 737    }
 738    TRACE("result=%p\n", result);
 739    return result;
 740}
 741
 742static void bm_shrink(BmPageHeader* bm_page, unsigned offset, unsigned old_num_units, unsigned new_num_units)
 743{
 744    TRACE("bm_page=%p, offset=%u, old_num_units=%u, new_num_units=%u\n",
 745          (void*) bm_page, offset, old_num_units, new_num_units);
 746
 747    grab_page(bm_page);
 748
 749    unsigned tail_units = old_num_units - new_num_units;
 750
 751#   ifdef DEBUG
 752        check_units_allocated(__func__, bm_page, offset + new_num_units, tail_units);
 753#   endif
 754    clear_bits(bm_page, offset + new_num_units, tail_units);
 755
 756    unhand_page(bm_page);
 757}
 758
 759static bool bm_grow(BmPageHeader* bm_page, unsigned offset, unsigned old_num_units, unsigned new_num_units)
 760{
 761    TRACE("bm_page=%p, offset=%u, old_num_units=%u, new_num_units=%u\n",
 762          (void*) bm_page, offset, old_num_units, new_num_units);
 763
 764    grab_page(bm_page);
 765
 766    unsigned increment = new_num_units - old_num_units;
 767    unsigned length = count_zero_bits(bm_page, offset + old_num_units, increment);
 768    if (length < increment) {
 769        TRACE("available length %u is less than increment %u; need to move\n", length, increment);
 770        unhand_page(bm_page);
 771        return false;
 772    }
 773    set_bits(bm_page, offset + old_num_units, increment);
 774
 775    unhand_page(bm_page);
 776    return true;
 777}
 778
 779static void bm_release(BmPageHeader* bm_page, unsigned offset, unsigned num_units)
 780{
 781    TRACE("bm_page=%p, offset=%u, num_units=%u\n", (void*) bm_page, offset, num_units);
 782
 783    grab_page(bm_page);
 784
 785#   ifdef DEBUG
 786        check_units_allocated(__func__, bm_page, offset, num_units);
 787#   endif
 788    clear_bits(bm_page, offset, num_units);
 789
 790    unhand_page(bm_page);
 791    atomic_fetch_sub(&stats.blocks_allocated, 1);
 792}
 793
 794/****************************************************************
 795 * Allocator interface functions
 796 */
 797
 798static void _init()
 799{
 800    // init page parameters
 801
 802    units_per_page = sys_page_size() / UNIT_SIZE;
 803
 804    bm_page_header_size_in_units = (
 805        offsetof(BmPageHeader, bitmap)
 806        + units_per_page / 8  // size of bitmap in bytes
 807        + UNIT_SIZE - 1       // rounding
 808    ) / UNIT_SIZE;
 809
 810    max_data_units = units_per_page - bm_page_header_size_in_units;
 811
 812    // allocate superblock
 813
 814    superblock = call_mmap(sys_page_size(), true);
 815    if (!superblock) {
 816        abort();
 817    }
 818
 819    // init mutex
 820    if (mtx_init(&lock, mtx_plain) != thrd_success) {
 821        ERR("cannot init mutex\n");
 822    }
 823
 824    SAY("page size %u; units per page: %u; header: %u units; data units: %u (%u bytes)\n",
 825        sys_page_size(), units_per_page, bm_page_header_size_in_units, max_data_units, max_data_units * UNIT_SIZE);
 826}
 827
 828
 829static void* _allocate(unsigned nbytes, bool clean)
 830{
 831    TRACE("nbytes=%u\n", nbytes);
 832
 833    if (nbytes == 0) {
 834        return nullptr;
 835    }
 836    unsigned num_units = bytes_to_units(nbytes);
 837    if (num_units < max_data_units) {
 838        // use bitmap sub-allocator for smaller blocks
 839        return bm_allocate(num_units, clean);
 840    } else {
 841        // allocate pages directly
 842        void* result = call_mmap(align_unsigned_to_page(nbytes), clean);
 843        if (result) {
 844            atomic_fetch_add(&stats.blocks_allocated, 1);
 845        }
 846        return result;
 847    }
 848}
 849
 850static void _release(void** addr_ptr, unsigned nbytes)
 851{
 852    void* addr = *addr_ptr;
 853    if (!addr) {
 854        return;
 855    }
 856
 857    TRACE("addr=%p nbytes=%u\n", addr, nbytes);
 858
 859    if (nbytes == 0) {
 860        ERR("called for %p with zero nbytes\n", addr);
 861        abort();
 862    }
 863
 864    BmPageHeader* bm_page = bm_page_by_addr(addr);
 865    if (addr == (void*) bm_page) {
 866        /*
 867         * addr is aligned on page boundary, this means
 868         * the block was allocated directly with mmap
 869         */
 870        call_munmap(addr, align_unsigned_to_page(nbytes));
 871        atomic_fetch_sub(&stats.blocks_allocated, 1);
 872
 873    } else {
 874        // use bitmap sub-allocator for smaller blocks
 875        bm_release(bm_page, ptrdiff_to_units(addr, bm_page), bytes_to_units(nbytes));
 876    }
 877    *addr_ptr = nullptr;
 878}
 879
 880static bool _reallocate(void** addr_ptr, unsigned old_nbytes, unsigned new_nbytes, bool clean, bool* addr_changed)
 881{
 882    if (old_nbytes == new_nbytes) {
 883        goto success_same_addr;
 884    }
 885
 886    void* addr = *addr_ptr;
 887
 888    TRACE("addr=%p old_nbytes=%u new_nbytes=%u\n", addr, old_nbytes, new_nbytes);
 889
 890    // shall we allocate new addr?
 891    if (addr == nullptr) {
 892        if (old_nbytes != 0) {
 893            goto error;
 894        }
 895        addr = _allocate(new_nbytes, clean);
 896        if (!addr) {
 897            goto error;
 898        }
 899        *addr_ptr = addr;
 900        goto success_changed_addr;
 901    }
 902
 903    if (!(old_nbytes | new_nbytes)) {
 904        // XXX might be a serious error, but it's a caller's problem
 905        if (!old_nbytes) { ERR("called for %p with zero old_nbytes\n", addr); }
 906        if (!new_nbytes) { ERR("called for %p with zero new_nbytes\n", addr); }
 907        goto error;
 908    }
 909
 910    unsigned new_num_units = bytes_to_units(new_nbytes);
 911    unsigned old_num_units = bytes_to_units(old_nbytes);
 912
 913    if (new_num_units == old_num_units) {
 914        if (clean && new_nbytes > old_nbytes) {
 915            cleanse(addr, old_nbytes, new_nbytes);
 916        }
 917        goto success_same_addr;
 918    }
 919
 920    BmPageHeader* bm_page = bm_page_by_addr(addr);
 921
 922    // shall we shrink?
 923    if (new_num_units < old_num_units) {
 924
 925        if (new_num_units < max_data_units) {
 926
 927            // new block will use bitmap sub-allocator
 928
 929            if (old_num_units < max_data_units) {
 930                // shrink using bitmap sub-allocator
 931                if (addr == (void*) bm_page) {
 932                    ERR("address %p is not within data area\n", addr);
 933                    abort();
 934                }
 935                bm_shrink(bm_page, ptrdiff_to_units(addr, bm_page), old_num_units, new_num_units);
 936                goto success_same_addr;
 937            }
 938
 939            // shrinking block from page allocator to bitmap sub-allocator
 940
 941            if (addr != (void*) bm_page) {
 942                ERR("address %p is not aligned on page boundary\n", addr);
 943                abort();
 944            }
 945            void* new_block = bm_allocate(new_num_units, false);
 946            if (!new_block) {
 947                TRACE("falling back to remap\n");
 948                goto remap;
 949            }
 950            memcpy(new_block, addr, new_nbytes);
 951            bm_release(bm_page, ptrdiff_to_units(addr, bm_page), new_num_units);
 952            *addr_ptr = new_block;
 953            goto success_changed_addr;
 954
 955        } else {
 956            // shrink using mremap
 957            if (addr != (void*) bm_page) {
 958                ERR("address %p is not aligned on page boundary\n", addr);
 959                abort();
 960            }
 961    remap:
 962            call_mremap(addr, old_nbytes, new_nbytes, false);
 963            goto success_same_addr;
 964        }
 965    }
 966
 967    // grow
 968
 969    if (old_num_units < max_data_units) {
 970
 971        if (new_num_units < max_data_units) {
 972
 973            // grow using bitmap sub-allocator
 974
 975            if (addr == (void*) bm_page) {
 976                ERR("address %p is not within data area\n", addr);
 977                abort();
 978            }
 979            // try to grow within the same page
 980            if(bm_grow(bm_page, ptrdiff_to_units(addr, bm_page), old_num_units, new_num_units)) {
 981                if (clean) {
 982                    cleanse(addr, old_nbytes, new_nbytes);
 983                }
 984                goto success_same_addr;
 985            }
 986        }
 987
 988        // reallocate block
 989
 990        void* new_block = _allocate(new_nbytes, false);
 991        if (!new_block) {
 992            goto error;
 993        }
 994        memcpy(new_block, addr, old_nbytes);
 995        _release(&addr, old_nbytes);
 996        if (clean) {
 997            cleanse(new_block, old_nbytes, new_nbytes);
 998        }
 999        *addr_ptr = new_block;
1000        if (addr_changed) { *addr_changed = new_block != addr; }
1001        return true;
1002
1003    } else {
1004        // grow using mremap
1005        if (addr != (void*) bm_page) {
1006            ERR("address %p is not aligned on page boundary\n", addr);
1007            abort();
1008        }
1009        void* new_addr = call_mremap(addr, old_nbytes, new_nbytes, clean);
1010        if (!new_addr) {
1011            goto error;
1012        }
1013        *addr_ptr = new_addr;
1014        if (addr_changed) { *addr_changed = new_addr != addr; }
1015        return true;
1016    }
1017
1018success_changed_addr:
1019    if (addr_changed) { *addr_changed = true; }
1020    return true;
1021
1022success_same_addr:
1023    if (addr_changed) { *addr_changed = false; }
1024    return true;
1025
1026error:
1027    if (addr_changed) { *addr_changed = false; }
1028    return false;
1029}
1030
1031Allocator pet_allocator = {
1032    .init       = _init,
1033    .allocate   = _allocate,
1034    .reallocate = _reallocate,
1035    .release    = _release,
1036    .dump       = dump,
1037    .trace      = false,
1038    .verbose    = false,
1039    .stats      = &stats
1040};