Browse Source

replace linux bitmask with bit_array

Semyon Verchenko 8 years ago
parent
commit
16088ae4f1
12 changed files with 4143 additions and 142 deletions
  1. 7 2
      Makefile.am
  2. 1 0
      balance.c
  3. 3159 0
      bit_array.c
  4. 552 0
      bit_array.h
  5. 205 0
      bit_macros.h
  6. 4 0
      cpu.c
  7. 42 139
      cpumask.h
  8. 137 0
      hexio.c
  9. 17 0
      hexio.h
  10. 11 1
      irq.c
  11. 4 0
      numa.c
  12. 4 0
      pxm.c

+ 7 - 2
Makefile.am

@@ -23,7 +23,10 @@ noinst_HEADERS = \
 	numa.h \
 	statistics.h \
 	balance.h \
-	pxm.h
+	pxm.h \
+	bit_array.h \
+	bit_macros.h \
+	hexio.h
 
 birq_SOURCES = \
 	birq.c \
@@ -33,7 +36,9 @@ birq_SOURCES = \
 	numa.c \
 	statistics.c \
 	balance.c \
-	pxm.c
+	pxm.c \
+	bit_array.c \
+	hexio.c
 
 birq_LDADD = liblub.a
 birq_DEPENDENCIES = liblub.a

+ 1 - 0
balance.c

@@ -171,6 +171,7 @@ int balance(lub_list_t *cpus, lub_list_t *balance_irqs, float load_limit)
 		   be held by QPI-like interfaces through local CPUs. */
 /*		if (!cpu) {
 			cpumask_t complement;
+			cpus_init(complement);
 			cpus_complement(complement, irq->local_cpus);
 			cpu = choose_cpu(cpus, &complement, load_limit);
 		}

+ 3159 - 0
bit_array.c

@@ -0,0 +1,3159 @@
+/*
+ bit_array.c
+ project: bit array C library
+ url: https://github.com/noporpoise/BitArray/
+ maintainer: Isaac Turner <turner.isaac@gmail.com>
+ license: Public Domain, no warranty
+ date: Aug 2014
+*/
+
+// 64 bit words
+// Array length can be zero
+// Unused top bits must be zero
+
+#include <stdlib.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <limits.h> // ULONG_MAX
+#include <errno.h>
+#include <signal.h> // needed for abort()
+#include <string.h> // memset()
+#include <assert.h>
+#include <time.h> // needed for seeding rand()
+#include <unistd.h>  // need for getpid() for seeding rand number
+#include <ctype.h>  // need for tolower()
+#include <errno.h>  // perror()
+#include <sys/time.h> // for seeding random
+
+// Windows includes
+#if defined(_WIN32)
+#include <intrin.h>
+#endif
+
+#include "bit_array.h"
+#include "bit_macros.h"
+
+//
+// Tables of constants
+//
+
+// byte reverse look up table
+static const word_t reverse_table[256] =
+{
+  0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0,
+  0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0,
+  0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8,
+  0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8,
+  0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4,
+  0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4,
+  0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC,
+  0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC,
+  0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2,
+  0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2,
+  0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA,
+  0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA,
+  0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6,
+  0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6,
+  0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE,
+  0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE,
+  0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1,
+  0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1,
+  0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9,
+  0x19, 0x99, 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9,
+  0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5,
+  0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5,
+  0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, 0x6D, 0xED,
+  0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD,
+  0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3,
+  0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3,
+  0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB,
+  0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB,
+  0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7,
+  0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7,
+  0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF,
+  0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF,
+};
+
+// Morton table for interleaving bytes
+static const word_t morton_table0[256] =
+{
+  0x0000, 0x0001, 0x0004, 0x0005, 0x0010, 0x0011, 0x0014, 0x0015,
+  0x0040, 0x0041, 0x0044, 0x0045, 0x0050, 0x0051, 0x0054, 0x0055,
+  0x0100, 0x0101, 0x0104, 0x0105, 0x0110, 0x0111, 0x0114, 0x0115,
+  0x0140, 0x0141, 0x0144, 0x0145, 0x0150, 0x0151, 0x0154, 0x0155,
+  0x0400, 0x0401, 0x0404, 0x0405, 0x0410, 0x0411, 0x0414, 0x0415,
+  0x0440, 0x0441, 0x0444, 0x0445, 0x0450, 0x0451, 0x0454, 0x0455,
+  0x0500, 0x0501, 0x0504, 0x0505, 0x0510, 0x0511, 0x0514, 0x0515,
+  0x0540, 0x0541, 0x0544, 0x0545, 0x0550, 0x0551, 0x0554, 0x0555,
+  0x1000, 0x1001, 0x1004, 0x1005, 0x1010, 0x1011, 0x1014, 0x1015,
+  0x1040, 0x1041, 0x1044, 0x1045, 0x1050, 0x1051, 0x1054, 0x1055,
+  0x1100, 0x1101, 0x1104, 0x1105, 0x1110, 0x1111, 0x1114, 0x1115,
+  0x1140, 0x1141, 0x1144, 0x1145, 0x1150, 0x1151, 0x1154, 0x1155,
+  0x1400, 0x1401, 0x1404, 0x1405, 0x1410, 0x1411, 0x1414, 0x1415,
+  0x1440, 0x1441, 0x1444, 0x1445, 0x1450, 0x1451, 0x1454, 0x1455,
+  0x1500, 0x1501, 0x1504, 0x1505, 0x1510, 0x1511, 0x1514, 0x1515,
+  0x1540, 0x1541, 0x1544, 0x1545, 0x1550, 0x1551, 0x1554, 0x1555,
+  0x4000, 0x4001, 0x4004, 0x4005, 0x4010, 0x4011, 0x4014, 0x4015,
+  0x4040, 0x4041, 0x4044, 0x4045, 0x4050, 0x4051, 0x4054, 0x4055,
+  0x4100, 0x4101, 0x4104, 0x4105, 0x4110, 0x4111, 0x4114, 0x4115,
+  0x4140, 0x4141, 0x4144, 0x4145, 0x4150, 0x4151, 0x4154, 0x4155,
+  0x4400, 0x4401, 0x4404, 0x4405, 0x4410, 0x4411, 0x4414, 0x4415,
+  0x4440, 0x4441, 0x4444, 0x4445, 0x4450, 0x4451, 0x4454, 0x4455,
+  0x4500, 0x4501, 0x4504, 0x4505, 0x4510, 0x4511, 0x4514, 0x4515,
+  0x4540, 0x4541, 0x4544, 0x4545, 0x4550, 0x4551, 0x4554, 0x4555,
+  0x5000, 0x5001, 0x5004, 0x5005, 0x5010, 0x5011, 0x5014, 0x5015,
+  0x5040, 0x5041, 0x5044, 0x5045, 0x5050, 0x5051, 0x5054, 0x5055,
+  0x5100, 0x5101, 0x5104, 0x5105, 0x5110, 0x5111, 0x5114, 0x5115,
+  0x5140, 0x5141, 0x5144, 0x5145, 0x5150, 0x5151, 0x5154, 0x5155,
+  0x5400, 0x5401, 0x5404, 0x5405, 0x5410, 0x5411, 0x5414, 0x5415,
+  0x5440, 0x5441, 0x5444, 0x5445, 0x5450, 0x5451, 0x5454, 0x5455,
+  0x5500, 0x5501, 0x5504, 0x5505, 0x5510, 0x5511, 0x5514, 0x5515,
+  0x5540, 0x5541, 0x5544, 0x5545, 0x5550, 0x5551, 0x5554, 0x5555,
+};
+
+// Morton table for interleaving bytes, shifted left 1 bit
+static const word_t morton_table1[256] =
+{
+  0x0000, 0x0002, 0x0008, 0x000A, 0x0020, 0x0022, 0x0028, 0x002A,
+  0x0080, 0x0082, 0x0088, 0x008A, 0x00A0, 0x00A2, 0x00A8, 0x00AA,
+  0x0200, 0x0202, 0x0208, 0x020A, 0x0220, 0x0222, 0x0228, 0x022A,
+  0x0280, 0x0282, 0x0288, 0x028A, 0x02A0, 0x02A2, 0x02A8, 0x02AA,
+  0x0800, 0x0802, 0x0808, 0x080A, 0x0820, 0x0822, 0x0828, 0x082A,
+  0x0880, 0x0882, 0x0888, 0x088A, 0x08A0, 0x08A2, 0x08A8, 0x08AA,
+  0x0A00, 0x0A02, 0x0A08, 0x0A0A, 0x0A20, 0x0A22, 0x0A28, 0x0A2A,
+  0x0A80, 0x0A82, 0x0A88, 0x0A8A, 0x0AA0, 0x0AA2, 0x0AA8, 0x0AAA,
+  0x2000, 0x2002, 0x2008, 0x200A, 0x2020, 0x2022, 0x2028, 0x202A,
+  0x2080, 0x2082, 0x2088, 0x208A, 0x20A0, 0x20A2, 0x20A8, 0x20AA,
+  0x2200, 0x2202, 0x2208, 0x220A, 0x2220, 0x2222, 0x2228, 0x222A,
+  0x2280, 0x2282, 0x2288, 0x228A, 0x22A0, 0x22A2, 0x22A8, 0x22AA,
+  0x2800, 0x2802, 0x2808, 0x280A, 0x2820, 0x2822, 0x2828, 0x282A,
+  0x2880, 0x2882, 0x2888, 0x288A, 0x28A0, 0x28A2, 0x28A8, 0x28AA,
+  0x2A00, 0x2A02, 0x2A08, 0x2A0A, 0x2A20, 0x2A22, 0x2A28, 0x2A2A,
+  0x2A80, 0x2A82, 0x2A88, 0x2A8A, 0x2AA0, 0x2AA2, 0x2AA8, 0x2AAA,
+  0x8000, 0x8002, 0x8008, 0x800A, 0x8020, 0x8022, 0x8028, 0x802A,
+  0x8080, 0x8082, 0x8088, 0x808A, 0x80A0, 0x80A2, 0x80A8, 0x80AA,
+  0x8200, 0x8202, 0x8208, 0x820A, 0x8220, 0x8222, 0x8228, 0x822A,
+  0x8280, 0x8282, 0x8288, 0x828A, 0x82A0, 0x82A2, 0x82A8, 0x82AA,
+  0x8800, 0x8802, 0x8808, 0x880A, 0x8820, 0x8822, 0x8828, 0x882A,
+  0x8880, 0x8882, 0x8888, 0x888A, 0x88A0, 0x88A2, 0x88A8, 0x88AA,
+  0x8A00, 0x8A02, 0x8A08, 0x8A0A, 0x8A20, 0x8A22, 0x8A28, 0x8A2A,
+  0x8A80, 0x8A82, 0x8A88, 0x8A8A, 0x8AA0, 0x8AA2, 0x8AA8, 0x8AAA,
+  0xA000, 0xA002, 0xA008, 0xA00A, 0xA020, 0xA022, 0xA028, 0xA02A,
+  0xA080, 0xA082, 0xA088, 0xA08A, 0xA0A0, 0xA0A2, 0xA0A8, 0xA0AA,
+  0xA200, 0xA202, 0xA208, 0xA20A, 0xA220, 0xA222, 0xA228, 0xA22A,
+  0xA280, 0xA282, 0xA288, 0xA28A, 0xA2A0, 0xA2A2, 0xA2A8, 0xA2AA,
+  0xA800, 0xA802, 0xA808, 0xA80A, 0xA820, 0xA822, 0xA828, 0xA82A,
+  0xA880, 0xA882, 0xA888, 0xA88A, 0xA8A0, 0xA8A2, 0xA8A8, 0xA8AA,
+  0xAA00, 0xAA02, 0xAA08, 0xAA0A, 0xAA20, 0xAA22, 0xAA28, 0xAA2A,
+  0xAA80, 0xAA82, 0xAA88, 0xAA8A, 0xAAA0, 0xAAA2, 0xAAA8, 0xAAAA,
+};
+
+//
+// Macros
+//
+
+// WORD_SIZE is the number of bits per word
+// sizeof gives size in bytes (8 bits per byte)
+#define WORD_SIZE 64
+// #define WORD_SIZE (sizeof(word_t) * 8)
+
+// POPCOUNT is number of bits set
+
+#if defined(_WIN32)
+
+// See http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+static word_t __inline windows_popcount(word_t w)
+{
+  w = w - ((w >> 1) & (word_t)~(word_t)0/3);
+  w = (w & (word_t)~(word_t)0/15*3) + ((w >> 2) & (word_t)~(word_t)0/15*3);
+  w = (w + (w >> 4)) & (word_t)~(word_t)0/255*15;
+  c = (word_t)(w * ((word_t)~(word_t)0/255)) >> (sizeof(word_t) - 1) * 8;
+}
+
+static word_t __inline windows_parity(word_t w)
+{
+  w ^= w >> 1;
+  w ^= w >> 2;
+  w = (w & 0x1111111111111111UL) * 0x1111111111111111UL;
+  return (w >> 60) & 1;
+}
+
+#define POPCOUNT(x) windows_popcountl(x)
+#define PARITY(x) windows_parity(x)
+#else
+#define POPCOUNT(x) (unsigned)__builtin_popcountll(x)
+#define PARITY(x) (unsigned)__builtin_parityll(x)
+#endif
+
+#define MIN(a, b)  (((a) <= (b)) ? (a) : (b))
+#define MAX(a, b)  (((a) >= (b)) ? (a) : (b))
+
+// Make this a power of two
+#define INIT_CAPACITY_WORDS 2
+
+// word of all 1s
+#define WORD_MAX  (~(word_t)0)
+
+#define SET_REGION(arr,start,len)    _set_region((arr),(start),(len),FILL_REGION)
+#define CLEAR_REGION(arr,start,len)  _set_region((arr),(start),(len),ZERO_REGION)
+#define TOGGLE_REGION(arr,start,len) _set_region((arr),(start),(len),SWAP_REGION)
+
+// Have we initialised with srand() ?
+static char rand_initiated = 0;
+
+static void _seed_rand()
+{
+  if(!rand_initiated)
+  {
+    // Initialise random number generator
+    struct timeval time;
+    gettimeofday(&time, NULL);
+    srand((((time.tv_sec ^ getpid()) * 1000001) + time.tv_usec));
+    rand_initiated = 1;
+  }
+}
+
+//
+// Common internal functions
+//
+
+#define bits_in_top_word(nbits) ((nbits) ? bitset64_idx((nbits) - 1) + 1 : 0)
+
+// Mostly used for debugging
+static inline void _print_word(word_t word, FILE* out)
+{
+  word_offset_t i;
+  for(i = 0; i < WORD_SIZE; i++)
+  {
+    fprintf(out, "%c", ((word >> i) & (word_t)0x1) == 0 ? '0' : '1');
+  }
+}
+
+// prints right to left
+static inline char* _word_to_str(word_t word, char str[WORD_SIZE+1])
+  __attribute__((unused));
+
+static inline char* _word_to_str(word_t word, char str[WORD_SIZE+1])
+{
+  word_offset_t i;
+  for(i = 0; i < WORD_SIZE; i++)
+  {
+    str[WORD_SIZE-i-1] = ((word >> i) & (word_t)0x1) == 0 ? '0' : '1';
+  }
+  str[WORD_SIZE] = '\0';
+  return str;
+}
+
+// Used in debugging
+#ifdef DEBUG
+  #define DEBUG_PRINT(msg,...) printf("[%s:%i] "msg, __FILE__, __LINE__, ##__VA_ARGS__);
+  #define DEBUG_VALIDATE(a) validate_bitarr((a), __FILE__, __LINE__)
+#else
+  #define DEBUG_PRINT(msg,...)
+  #define DEBUG_VALIDATE(a)
+#endif
+
+void validate_bitarr(BIT_ARRAY *arr, const char *file, int lineno)
+{
+  // Check top word is masked
+  word_addr_t tw = arr->num_of_words == 0 ? 0 : arr->num_of_words - 1;
+  bit_index_t top_bits = bits_in_top_word(arr->num_of_bits);
+  int err = 0;
+
+  if(arr->words[tw] > bitmask64(top_bits))
+  {
+    _print_word(arr->words[tw], stderr);
+    fprintf(stderr, "\n[%s:%i] Expected %i bits in top word[%i]\n",
+            file, lineno, (int)top_bits, (int)tw);
+    err = 1;
+  }
+
+  // Check num of words is correct
+  word_addr_t num_words = roundup_bits2words64(arr->num_of_bits);
+  if(num_words != arr->num_of_words)
+  {
+    fprintf(stderr, "\n[%s:%i] num of words wrong "
+                    "[bits: %i, word: %i, actual words: %i]\n", file, lineno,
+            (int)arr->num_of_bits, (int)num_words, (int)arr->num_of_words);
+    err = 1;
+  }
+
+  if(err) abort();
+}
+
+// Reverse a word
+static inline word_t _reverse_word(word_t word)
+{
+  word_t reverse = (reverse_table[(word)       & 0xff] << 56) |
+                   (reverse_table[(word >>  8) & 0xff] << 48) |
+                   (reverse_table[(word >> 16) & 0xff] << 40) |
+                   (reverse_table[(word >> 24) & 0xff] << 32) |
+                   (reverse_table[(word >> 32) & 0xff] << 24) |
+                   (reverse_table[(word >> 40) & 0xff] << 16) |
+                   (reverse_table[(word >> 48) & 0xff] << 8) |
+                   (reverse_table[(word >> 56) & 0xff]);
+
+  return reverse;
+}
+
+static inline void _mask_top_word(BIT_ARRAY* bitarr)
+{
+  // Mask top word
+  word_addr_t num_of_words = MAX(1, bitarr->num_of_words);
+  word_offset_t bits_active = bits_in_top_word(bitarr->num_of_bits);
+  bitarr->words[num_of_words-1] &= bitmask64(bits_active);
+}
+
+//
+// Get and set words (internal use only -- no bounds checking)
+//
+
+static inline word_t _get_word(const BIT_ARRAY* bitarr, bit_index_t start)
+{
+  word_addr_t word_index = bitset64_wrd(start);
+  word_offset_t word_offset = bitset64_idx(start);
+
+  word_t result = bitarr->words[word_index] >> word_offset;
+
+  word_offset_t bits_taken = WORD_SIZE - word_offset;
+
+  // word_offset is now the number of bits we need from the next word
+  // Check the next word has at least some bits
+  if(word_offset > 0 && start + bits_taken < bitarr->num_of_bits)
+  {
+    result |= bitarr->words[word_index+1] << (WORD_SIZE - word_offset);
+  }
+
+  return result;
+}
+
+// Set 64 bits from a particular start position
+// Doesn't extend bit array
+static inline void _set_word(BIT_ARRAY* bitarr, bit_index_t start, word_t word)
+{
+  word_addr_t word_index = bitset64_wrd(start);
+  word_offset_t word_offset = bitset64_idx(start);
+
+  if(word_offset == 0)
+  {
+    bitarr->words[word_index] = word;
+  }
+  else
+  {
+    bitarr->words[word_index]
+      = (word << word_offset) |
+        (bitarr->words[word_index] & bitmask64(word_offset));
+
+    if(word_index+1 < bitarr->num_of_words)
+    {
+      bitarr->words[word_index+1]
+        = (word >> (WORD_SIZE - word_offset)) |
+          (bitarr->words[word_index+1] & (WORD_MAX << word_offset));
+    }
+  }
+
+  // Mask top word
+  _mask_top_word(bitarr);
+  DEBUG_VALIDATE(bitarr);
+}
+
+static inline void _set_byte(BIT_ARRAY *bitarr, bit_index_t start, uint8_t byte)
+{
+  word_t w = _get_word(bitarr, start);
+  _set_word(bitarr, start, (w & ~(word_t)0xff) | byte);
+}
+
+// 4 bits
+static inline void _set_nibble(BIT_ARRAY *bitarr, bit_index_t start,
+                               uint8_t nibble)
+{
+  word_t w = _get_word(bitarr, start);
+  _set_word(bitarr, start, (w & ~(word_t)0xf) | nibble);
+}
+
+// Wrap around
+static inline word_t _get_word_cyclic(const BIT_ARRAY* bitarr, bit_index_t start)
+{
+  word_t word = _get_word(bitarr, start);
+
+  bit_index_t bits_taken = bitarr->num_of_bits - start;
+
+  if(bits_taken < WORD_SIZE)
+  {
+    word |= (bitarr->words[0] << bits_taken);
+
+    if(bitarr->num_of_bits < (bit_index_t)WORD_SIZE)
+    {
+      // Mask word to prevent repetition of the same bits
+      word = word & bitmask64(bitarr->num_of_bits);
+    }
+  }
+
+  return word;
+}
+
+// Wrap around
+static inline void _set_word_cyclic(BIT_ARRAY* bitarr,
+                                    bit_index_t start, word_t word)
+{
+  _set_word(bitarr, start, word);
+
+  bit_index_t bits_set = bitarr->num_of_bits - start;
+
+  if(bits_set < WORD_SIZE && start > 0)
+  {
+    word >>= bits_set;
+
+    // Prevent overwriting the bits we've just set
+    // by setting 'start' as the upper bound for the number of bits to write
+    word_offset_t bits_remaining = MIN(WORD_SIZE - bits_set, start);
+    word_t mask = bitmask64(bits_remaining);
+
+    bitarr->words[0] = bitmask_merge(word, bitarr->words[0], mask);
+  }
+}
+
+//
+// Fill a region (internal use only)
+//
+
+// FillAction is fill with 0 or 1 or toggle
+typedef enum {ZERO_REGION, FILL_REGION, SWAP_REGION} FillAction;
+
+static inline void _set_region(BIT_ARRAY* bitarr, bit_index_t start,
+                               bit_index_t length, FillAction action)
+{
+  if(length == 0) return;
+
+  word_addr_t first_word = bitset64_wrd(start);
+  word_addr_t last_word = bitset64_wrd(start+length-1);
+  word_offset_t foffset = bitset64_idx(start);
+  word_offset_t loffset = bitset64_idx(start+length-1);
+
+  if(first_word == last_word)
+  {
+    word_t mask = bitmask64(length) << foffset;
+
+    switch(action)
+    {
+      case ZERO_REGION: bitarr->words[first_word] &= ~mask; break;
+      case FILL_REGION: bitarr->words[first_word] |=  mask; break;
+      case SWAP_REGION: bitarr->words[first_word] ^=  mask; break;
+    }
+  }
+  else
+  {
+    // Set first word
+    switch(action)
+    {
+      case ZERO_REGION: bitarr->words[first_word] &=  bitmask64(foffset); break;
+      case FILL_REGION: bitarr->words[first_word] |= ~bitmask64(foffset); break;
+      case SWAP_REGION: bitarr->words[first_word] ^= ~bitmask64(foffset); break;
+    }
+
+    word_addr_t i;
+
+    // Set whole words
+    switch(action)
+    {
+      case ZERO_REGION:
+        for(i = first_word + 1; i < last_word; i++)
+          bitarr->words[i] = (word_t)0;
+        break;
+      case FILL_REGION:
+        for(i = first_word + 1; i < last_word; i++)
+          bitarr->words[i] = WORD_MAX;
+        break;
+      case SWAP_REGION:
+        for(i = first_word + 1; i < last_word; i++)
+          bitarr->words[i] ^= WORD_MAX;
+        break;
+    }
+
+    // Set last word
+    switch(action)
+    {
+      case ZERO_REGION: bitarr->words[last_word] &= ~bitmask64(loffset+1); break;
+      case FILL_REGION: bitarr->words[last_word] |=  bitmask64(loffset+1); break;
+      case SWAP_REGION: bitarr->words[last_word] ^=  bitmask64(loffset+1); break;
+    }
+  }
+}
+
+
+
+//
+// Constructor
+//
+
+// If cannot allocate memory, set errno to ENOMEM, return NULL
+BIT_ARRAY* bit_array_alloc(BIT_ARRAY* bitarr, bit_index_t nbits)
+{
+  bitarr->num_of_bits = nbits;
+  bitarr->num_of_words = roundup_bits2words64(nbits);
+  bitarr->capacity_in_words = MAX(8, roundup2pow(bitarr->num_of_words));
+  bitarr->words = (word_t*)calloc(bitarr->capacity_in_words, sizeof(word_t));
+  if(bitarr->words == NULL) {
+    errno = ENOMEM;
+    return NULL;
+  }
+  return bitarr;
+}
+
+void bit_array_dealloc(BIT_ARRAY* bitarr)
+{
+  free(bitarr->words);
+  memset(bitarr, 0, sizeof(BIT_ARRAY));
+}
+
+// If cannot allocate memory, set errno to ENOMEM, return NULL
+BIT_ARRAY* bit_array_create(bit_index_t nbits)
+{
+  BIT_ARRAY* bitarr = (BIT_ARRAY*)malloc(sizeof(BIT_ARRAY));
+
+  // error if could not allocate enough memory
+  if(bitarr == NULL || bit_array_alloc(bitarr, nbits) == NULL)
+  {
+    if(bitarr != NULL) free(bitarr);
+    errno = ENOMEM;
+    return NULL;
+  }
+
+  DEBUG_PRINT("Creating BIT_ARRAY (bits: %lu; allocated words: %lu; "
+              "using words: %lu; WORD_SIZE: %i)\n",
+              (unsigned long)nbits, (unsigned long)bitarr->capacity_in_words,
+              (unsigned long)roundup_bits2words64(nbits), (int)WORD_SIZE);
+
+  DEBUG_VALIDATE(bitarr);
+
+  return bitarr;
+}
+
+//
+// Destructor
+//
+void bit_array_free(BIT_ARRAY* bitarr)
+{
+  if(bitarr->words != NULL)
+    free(bitarr->words);
+
+  free(bitarr);
+}
+
+bit_index_t bit_array_length(const BIT_ARRAY* bit_arr)
+{
+  return bit_arr->num_of_bits;
+}
+
+// Change the size of a bit array. Enlarging an array will add zeros
+// to the end of it. Returns 1 on success, 0 on failure (e.g. not enough memory)
+char bit_array_resize(BIT_ARRAY* bitarr, bit_index_t new_num_of_bits)
+{
+  word_addr_t old_num_of_words = bitarr->num_of_words;
+  word_addr_t new_num_of_words = roundup_bits2words64(new_num_of_bits);
+
+  bitarr->num_of_bits = new_num_of_bits;
+  bitarr->num_of_words = new_num_of_words;
+
+  DEBUG_PRINT("Resize: old_num_of_words: %i; new_num_of_words: %i capacity: %i\n",
+              (int)old_num_of_words, (int)new_num_of_words,
+              (int)bitarr->capacity_in_words);
+
+  if(new_num_of_words > bitarr->capacity_in_words)
+  {
+    // Need to change the amount of memory used
+    word_addr_t old_capacity_in_words = bitarr->capacity_in_words;
+    size_t old_capacity_in_bytes = old_capacity_in_words * sizeof(word_t);
+
+    bitarr->capacity_in_words = roundup2pow(new_num_of_words);
+    bitarr->capacity_in_words = MAX(8, bitarr->capacity_in_words);
+
+    size_t new_capacity_in_bytes = bitarr->capacity_in_words * sizeof(word_t);
+    bitarr->words = (word_t*)realloc(bitarr->words, new_capacity_in_bytes);
+
+    if(bitarr->words == NULL)
+    {
+      // error - could not allocate enough memory
+      perror("resize realloc");
+      errno = ENOMEM;
+      return 0;
+    }
+
+    // Need to zero new memory
+    size_t num_bytes_to_zero = new_capacity_in_bytes - old_capacity_in_bytes;
+    memset(bitarr->words + old_capacity_in_words, 0, num_bytes_to_zero);
+
+    DEBUG_PRINT("zeroing from word %i for %i bytes\n", (int)old_capacity_in_words,
+                (int)num_bytes_to_zero);
+  }
+  else if(new_num_of_words < old_num_of_words)
+  {
+    // Shrunk -- need to zero old memory
+    size_t num_bytes_to_zero = (old_num_of_words - new_num_of_words)*sizeof(word_t);
+
+    memset(bitarr->words + new_num_of_words, 0, num_bytes_to_zero);
+  }
+
+  // Mask top word
+  _mask_top_word(bitarr);
+  DEBUG_VALIDATE(bitarr);
+  return 1;
+}
+
+void bit_array_resize_critical(BIT_ARRAY* bitarr, bit_index_t num_of_bits)
+{
+  bit_index_t old_num_of_bits = bitarr->num_of_bits;
+
+  if(!bit_array_resize(bitarr, num_of_bits))
+  {
+    fprintf(stderr, "Ran out of memory resizing [%lu -> %lu]",
+            (unsigned long)old_num_of_bits, (unsigned long)num_of_bits);
+    abort();
+  }
+}
+
+// If bitarr length < num_bits, resizes to num_bits
+char bit_array_ensure_size(BIT_ARRAY* bitarr, bit_index_t ensure_num_of_bits)
+{
+  if(bitarr->num_of_bits < ensure_num_of_bits)
+  {
+    return bit_array_resize(bitarr, ensure_num_of_bits);
+  }
+
+  return 1;
+}
+
+void bit_array_ensure_size_critical(BIT_ARRAY* bitarr, bit_index_t num_of_bits)
+{
+  if(num_of_bits > bitarr->num_of_bits)
+  {
+    bit_array_resize_critical(bitarr, num_of_bits);
+  }
+}
+
+static inline
+void _bit_array_ensure_nwords(BIT_ARRAY* bitarr, word_addr_t nwords,
+                              const char *file, int lineno, const char *func)
+{
+  size_t newmem, oldmem;
+  if(bitarr->capacity_in_words < nwords) {
+    oldmem = bitarr->capacity_in_words * sizeof(word_t);
+    bitarr->capacity_in_words = roundup2pow(nwords);
+    newmem = bitarr->capacity_in_words * sizeof(word_t);
+    bitarr->words = (word_t*)realloc(bitarr->words, newmem);
+
+    if(bitarr->words == NULL) {
+      fprintf(stderr, "[%s:%i:%s()] Ran out of memory resizing [%zu -> %zu]",
+              file, lineno, func, oldmem, newmem);
+      abort();
+    }
+
+    DEBUG_PRINT("Ensure nwords realloc %zu -> %zu\n", oldmem, newmem);
+  }
+}
+
+
+//
+// Get, set, clear, assign and toggle individual bits
+//
+
+// Get the value of a bit (returns 0 or 1)
+char bit_array_get_bit(const BIT_ARRAY* bitarr, bit_index_t b)
+{
+  assert(b < bitarr->num_of_bits);
+  return bit_array_get(bitarr, b);
+}
+
+// set a bit (to 1) at position b
+void bit_array_set_bit(BIT_ARRAY* bitarr, bit_index_t b)
+{
+  assert(b < bitarr->num_of_bits);
+  bit_array_set(bitarr,b);
+  DEBUG_VALIDATE(bitarr);
+}
+
+// clear a bit (to 0) at position b
+void bit_array_clear_bit(BIT_ARRAY* bitarr, bit_index_t b)
+{
+  assert(b < bitarr->num_of_bits);
+  bit_array_clear(bitarr, b);
+  DEBUG_VALIDATE(bitarr);
+}
+
+// If bit is 0 -> 1, if bit is 1 -> 0.  AKA 'flip'
+void bit_array_toggle_bit(BIT_ARRAY* bitarr, bit_index_t b)
+{
+  assert(b < bitarr->num_of_bits);
+  bit_array_toggle(bitarr, b);
+  DEBUG_VALIDATE(bitarr);
+}
+
+// If char c != 0, set bit; otherwise clear bit
+void bit_array_assign_bit(BIT_ARRAY* bitarr, bit_index_t b, char c)
+{
+  assert(b < bitarr->num_of_bits);
+  bit_array_assign(bitarr, b, c ? 1 : 0);
+  DEBUG_VALIDATE(bitarr);
+}
+
+//
+// Get, set etc with resize
+//
+
+// Get the value of a bit (returns 0 or 1)
+char bit_array_rget(BIT_ARRAY* bitarr, bit_index_t b)
+{
+  bit_array_ensure_size_critical(bitarr, b+1);
+  return bit_array_get(bitarr, b);
+}
+
+// set a bit (to 1) at position b
+void bit_array_rset(BIT_ARRAY* bitarr, bit_index_t b)
+{
+  bit_array_ensure_size_critical(bitarr, b+1);
+  bit_array_set(bitarr,b);
+  DEBUG_VALIDATE(bitarr);
+}
+
+// clear a bit (to 0) at position b
+void bit_array_rclear(BIT_ARRAY* bitarr, bit_index_t b)
+{
+  bit_array_ensure_size_critical(bitarr, b+1);
+  bit_array_clear(bitarr, b);
+  DEBUG_VALIDATE(bitarr);
+}
+
+// If bit is 0 -> 1, if bit is 1 -> 0.  AKA 'flip'
+void bit_array_rtoggle(BIT_ARRAY* bitarr, bit_index_t b)
+{
+  bit_array_ensure_size_critical(bitarr, b+1);
+  bit_array_toggle(bitarr, b);
+  DEBUG_VALIDATE(bitarr);
+}
+
+// If char c != 0, set bit; otherwise clear bit
+void bit_array_rassign(BIT_ARRAY* bitarr, bit_index_t b, char c)
+{
+  bit_array_ensure_size_critical(bitarr, b+1);
+  bit_array_assign(bitarr, b, c ? 1 : 0);
+  DEBUG_VALIDATE(bitarr);
+}
+
+//
+// Set, clear and toggle several bits at once
+//
+
+// Set multiple bits at once.
+// e.g. set bits 1, 20 & 31: bit_array_set_bits(bitarr, 3, 1,20,31);
+void bit_array_set_bits(BIT_ARRAY* bitarr, size_t n, ...)
+{
+  size_t i;
+  va_list argptr;
+  va_start(argptr, n);
+
+  for(i = 0; i < n; i++)
+  {
+    unsigned int bit_index = va_arg(argptr, unsigned int);
+    bit_array_set_bit(bitarr, bit_index);
+  }
+
+  va_end(argptr);
+  DEBUG_VALIDATE(bitarr);
+}
+
+// Clear multiple bits at once.
+// e.g. clear bits 1, 20 & 31: bit_array_clear_bits(bitarr, 3, 1,20,31);
+void bit_array_clear_bits(BIT_ARRAY* bitarr, size_t n, ...)
+{
+  size_t i;
+  va_list argptr;
+  va_start(argptr, n);
+
+  for(i = 0; i < n; i++)
+  {
+    unsigned int bit_index = va_arg(argptr, unsigned int);
+    bit_array_clear_bit(bitarr, bit_index);
+  }
+
+  va_end(argptr);
+  DEBUG_VALIDATE(bitarr);
+}
+
+// Toggle multiple bits at once
+// e.g. toggle bits 1, 20 & 31: bit_array_toggle_bits(bitarr, 3, 1,20,31);
+void bit_array_toggle_bits(BIT_ARRAY* bitarr, size_t n, ...)
+{
+  size_t i;
+  va_list argptr;
+  va_start(argptr, n);
+
+  for(i = 0; i < n; i++)
+  {
+    unsigned int bit_index = va_arg(argptr, unsigned int);
+    bit_array_toggle_bit(bitarr, bit_index);
+  }
+
+  va_end(argptr);
+  DEBUG_VALIDATE(bitarr);
+}
+
+
+//
+// Set, clear and toggle all bits in a region
+//
+
+// Set all the bits in a region
+void bit_array_set_region(BIT_ARRAY* bitarr, bit_index_t start, bit_index_t len)
+{
+  assert(start + len <= bitarr->num_of_bits);
+  SET_REGION(bitarr, start, len);
+  DEBUG_VALIDATE(bitarr);
+}
+
+
+// Clear all the bits in a region
+void bit_array_clear_region(BIT_ARRAY* bitarr, bit_index_t start, bit_index_t len)
+{
+  assert(start + len <= bitarr->num_of_bits);
+  CLEAR_REGION(bitarr, start, len);
+  DEBUG_VALIDATE(bitarr);
+}
+
+// Toggle all the bits in a region
+void bit_array_toggle_region(BIT_ARRAY* bitarr, bit_index_t start, bit_index_t len)
+{
+  assert(start + len <= bitarr->num_of_bits);
+  TOGGLE_REGION(bitarr, start, len);
+  DEBUG_VALIDATE(bitarr);
+}
+
+
+//
+// Set, clear and toggle all bits at once
+//
+
+// set all elements of data to one
+void bit_array_set_all(BIT_ARRAY* bitarr)
+{
+  bit_index_t num_of_bytes = bitarr->num_of_words * sizeof(word_t);
+  memset(bitarr->words, 0xFF, num_of_bytes);
+  _mask_top_word(bitarr);
+  DEBUG_VALIDATE(bitarr);
+}
+
+// set all elements of data to zero
+void bit_array_clear_all(BIT_ARRAY* bitarr)
+{
+  memset(bitarr->words, 0, bitarr->num_of_words * sizeof(word_t));
+  DEBUG_VALIDATE(bitarr);
+}
+
+// Set all 1 bits to 0, and all 0 bits to 1. AKA flip
+void bit_array_toggle_all(BIT_ARRAY* bitarr)
+{
+  word_addr_t i;
+  for(i = 0; i < bitarr->num_of_words; i++)
+  {
+    bitarr->words[i] ^= WORD_MAX;
+  }
+
+  _mask_top_word(bitarr);
+  DEBUG_VALIDATE(bitarr);
+}
+
+//
+// Get a word at a time
+//
+
+uint64_t bit_array_get_word64(const BIT_ARRAY* bitarr, bit_index_t start)
+{
+  assert(start < bitarr->num_of_bits);
+  return (uint64_t)_get_word(bitarr, start);
+}
+
+uint32_t bit_array_get_word32(const BIT_ARRAY* bitarr, bit_index_t start)
+{
+  assert(start < bitarr->num_of_bits);
+  return (uint32_t)_get_word(bitarr, start);
+}
+
+uint16_t bit_array_get_word16(const BIT_ARRAY* bitarr, bit_index_t start)
+{
+  assert(start < bitarr->num_of_bits);
+  return (uint16_t)_get_word(bitarr, start);
+}
+
+uint8_t bit_array_get_word8(const BIT_ARRAY* bitarr, bit_index_t start)
+{
+  assert(start < bitarr->num_of_bits);
+  return (uint8_t)_get_word(bitarr, start);
+}
+
+uint64_t bit_array_get_wordn(const BIT_ARRAY* bitarr, bit_index_t start, int n)
+{
+  assert(start < bitarr->num_of_bits);
+  assert(n <= 64);
+  return (uint64_t)(_get_word(bitarr, start) & bitmask64(n));
+}
+
+//
+// Set a word at a time
+//
+
+void bit_array_set_word64(BIT_ARRAY* bitarr, bit_index_t start, uint64_t word)
+{
+  assert(start < bitarr->num_of_bits);
+  _set_word(bitarr, start, (word_t)word);
+}
+
+void bit_array_set_word32(BIT_ARRAY* bitarr, bit_index_t start, uint32_t word)
+{
+  assert(start < bitarr->num_of_bits);
+  word_t w = _get_word(bitarr, start);
+  _set_word(bitarr, start, (w & ~(word_t)0xffffffff) | word);
+}
+
+void bit_array_set_word16(BIT_ARRAY* bitarr, bit_index_t start, uint16_t word)
+{
+  assert(start < bitarr->num_of_bits);
+  word_t w = _get_word(bitarr, start);
+  _set_word(bitarr, start, (w & ~(word_t)0xffff) | word);
+}
+
+void bit_array_set_word8(BIT_ARRAY* bitarr, bit_index_t start, uint8_t byte)
+{
+  assert(start < bitarr->num_of_bits);
+  _set_byte(bitarr, start, byte);
+}
+
+void bit_array_set_wordn(BIT_ARRAY* bitarr, bit_index_t start, uint64_t word, int n)
+{
+  assert(start < bitarr->num_of_bits);
+  assert(n <= 64);
+  word_t w = _get_word(bitarr, start), m = bitmask64(n);
+  _set_word(bitarr, start, bitmask_merge(word,w,m));
+}
+
+//
+// Number of bits set
+//
+
+// Get the number of bits set (hamming weight)
+bit_index_t bit_array_num_bits_set(const BIT_ARRAY* bitarr)
+{
+  word_addr_t i;
+
+  bit_index_t num_of_bits_set = 0;
+
+  for(i = 0; i < bitarr->num_of_words; i++)
+  {
+    if(bitarr->words[i] > 0)
+    {
+      num_of_bits_set += POPCOUNT(bitarr->words[i]);
+    }
+  }
+
+  return num_of_bits_set;
+}
+
+// Get the number of bits not set (1 - hamming weight)
+bit_index_t bit_array_num_bits_cleared(const BIT_ARRAY* bitarr)
+{
+  return bitarr->num_of_bits - bit_array_num_bits_set(bitarr);
+}
+
+
+// Get the number of bits set in on array and not the other.  This is equivalent
+// to hamming weight of the XOR when the two arrays are the same length.
+// e.g. 10101 vs 00111 => hamming distance 2 (XOR is 10010)
+bit_index_t bit_array_hamming_distance(const BIT_ARRAY* arr1,
+                                       const BIT_ARRAY* arr2)
+{
+  word_addr_t min_words = MIN(arr1->num_of_words, arr2->num_of_words);
+  word_addr_t max_words = MAX(arr1->num_of_words, arr2->num_of_words);
+
+  bit_index_t hamming_distance = 0;
+  word_addr_t i;
+
+  for(i = 0; i < min_words; i++)
+  {
+    hamming_distance += POPCOUNT(arr1->words[i] ^ arr2->words[i]);
+  }
+
+  if(min_words != max_words)
+  {
+    const BIT_ARRAY* long_arr
+      = (arr1->num_of_words > arr2->num_of_words ? arr1 : arr2);
+
+    for(i = min_words; i < max_words; i++)
+    {
+      hamming_distance += POPCOUNT(long_arr->words[i]);
+    }
+  }
+
+  return hamming_distance;
+}
+
+// Parity - returns 1 if odd number of bits set, 0 if even
+char bit_array_parity(const BIT_ARRAY* bitarr)
+{
+  word_addr_t w;
+  unsigned int parity = 0;
+
+  for(w = 0; w < bitarr->num_of_words; w++)
+  {
+    parity ^= PARITY(bitarr->words[w]);
+  }
+
+  return (char)parity;
+}
+
+//
+// Find indices of set/clear bits
+//
+
+// Find the index of the next bit that is set/clear, at or after `offset`
+// Returns 1 if such a bit is found, otherwise 0
+// Index is stored in the integer pointed to by `result`
+// If no such bit is found, value at `result` is not changed
+#define _next_bit_func_def(FUNC,GET) \
+char FUNC(const BIT_ARRAY* bitarr, bit_index_t offset, bit_index_t* result) \
+{ \
+  assert(offset < bitarr->num_of_bits); \
+  if(bitarr->num_of_bits == 0 || offset >= bitarr->num_of_bits) { return 0; } \
+ \
+  /* Find first word that is greater than zero */ \
+  word_addr_t i = bitset64_wrd(offset); \
+  word_t w = GET(bitarr->words[i]) & ~bitmask64(bitset64_idx(offset)); \
+ \
+  while(1) { \
+    if(w > 0) { \
+      bit_index_t pos = i * WORD_SIZE + trailing_zeros(w); \
+      if(pos < bitarr->num_of_bits) { *result = pos; return 1; } \
+      else { return 0; } \
+    } \
+    i++; \
+    if(i >= bitarr->num_of_words) break; \
+    w = GET(bitarr->words[i]); \
+  } \
+ \
+  return 0; \
+}
+
+// Find the index of the previous bit that is set/clear, before `offset`.
+// Returns 1 if such a bit is found, otherwise 0
+// Index is stored in the integer pointed to by `result`
+// If no such bit is found, value at `result` is not changed
+#define _prev_bit_func_def(FUNC,GET) \
+char FUNC(const BIT_ARRAY* bitarr, bit_index_t offset, bit_index_t* result) \
+{ \
+  assert(offset <= bitarr->num_of_bits); \
+  if(bitarr->num_of_bits == 0 || offset == 0) { return 0; } \
+ \
+  /* Find prev word that is greater than zero */ \
+  word_addr_t i = bitset64_wrd(offset-1); \
+  word_t w = GET(bitarr->words[i]) & bitmask64(bitset64_idx(offset-1)+1); \
+ \
+  if(w > 0) { *result = (i+1) * WORD_SIZE - leading_zeros(w) - 1; return 1; } \
+ \
+  /* i is unsigned so have to use break when i == 0 */ \
+  for(--i; i != BIT_INDEX_MAX; i--) { \
+    w = GET(bitarr->words[i]); \
+    if(w > 0) { \
+      *result = (i+1) * WORD_SIZE - leading_zeros(w) - 1; \
+      return 1; \
+    } \
+  } \
+ \
+  return 0; \
+}
+
+#define GET_WORD(x) (x)
+#define NEG_WORD(x) (~(x))
+_next_bit_func_def(bit_array_find_next_set_bit,  GET_WORD);
+_next_bit_func_def(bit_array_find_next_clear_bit,NEG_WORD);
+_prev_bit_func_def(bit_array_find_prev_set_bit,  GET_WORD);
+_prev_bit_func_def(bit_array_find_prev_clear_bit,NEG_WORD);
+
+// Find the index of the first bit that is set.
+// Returns 1 if a bit is set, otherwise 0
+// Index of first set bit is stored in the integer pointed to by result
+// If no bits are set, value at `result` is not changed
+char bit_array_find_first_set_bit(const BIT_ARRAY* bitarr, bit_index_t* result)
+{
+  return bit_array_find_next_set_bit(bitarr, 0, result);
+}
+
+// same same
+char bit_array_find_first_clear_bit(const BIT_ARRAY* bitarr, bit_index_t* result)
+{
+  return bit_array_find_next_clear_bit(bitarr, 0, result);
+}
+
+// Find the index of the last bit that is set.
+// Returns 1 if a bit is set, otherwise 0
+// Index of last set bit is stored in the integer pointed to by `result`
+// If no bits are set, value at `result` is not changed
+char bit_array_find_last_set_bit(const BIT_ARRAY* bitarr, bit_index_t* result)
+{
+  return bit_array_find_prev_set_bit(bitarr, bitarr->num_of_bits, result);
+}
+
+// same same
+char bit_array_find_last_clear_bit(const BIT_ARRAY* bitarr, bit_index_t* result)
+{
+  return bit_array_find_prev_clear_bit(bitarr, bitarr->num_of_bits, result);
+}
+
+//
+// "Sorting" bits
+//
+
+// Put all the 0s before all the 1s
+void bit_array_sort_bits(BIT_ARRAY* bitarr)
+{
+  bit_index_t num_of_bits_set = bit_array_num_bits_set(bitarr);
+  bit_index_t num_of_bits_cleared = bitarr->num_of_bits - num_of_bits_set;
+  bit_array_set_all(bitarr);
+  CLEAR_REGION(bitarr, 0, num_of_bits_cleared);
+  DEBUG_VALIDATE(bitarr);
+}
+
+// Put all the 1s before all the 0s
+void bit_array_sort_bits_rev(BIT_ARRAY* bitarr)
+{
+  bit_index_t num_of_bits_set = bit_array_num_bits_set(bitarr);
+  bit_array_clear_all(bitarr);
+  SET_REGION(bitarr, 0, num_of_bits_set);
+  DEBUG_VALIDATE(bitarr);
+}
+
+
+//
+// Strings and printing
+//
+
+// Construct a BIT_ARRAY from a substring with given on and off characters.
+void bit_array_from_substr(BIT_ARRAY* bitarr, bit_index_t offset,
+                           const char *str, size_t len,
+                           const char *on, const char *off,
+                           char left_to_right)
+{
+  bit_array_ensure_size(bitarr, offset + len);
+  bit_array_clear_region(bitarr, offset, len);
+
+  // BitArray region is now all 0s -- just set the 1s
+  size_t i;
+  bit_index_t j;
+
+  for(i = 0; i < len; i++)
+  {
+    if(strchr(on, str[i]) != NULL)
+    {
+      j = offset + (left_to_right ? i : len - i - 1);
+      bit_array_set(bitarr, j);
+    }
+    else { assert(strchr(off, str[i]) != NULL); }
+  }
+
+  DEBUG_VALIDATE(bitarr);
+}
+
+// From string method
+void bit_array_from_str(BIT_ARRAY* bitarr, const char* str)
+{
+  bit_array_from_substr(bitarr, 0, str, strlen(str), "1", "0", 1);
+}
+
+// Takes a char array to write to.  `str` must be bitarr->num_of_bits+1 in length
+// Terminates string with '\0'
+char* bit_array_to_str(const BIT_ARRAY* bitarr, char* str)
+{
+  bit_index_t i;
+
+  for(i = 0; i < bitarr->num_of_bits; i++)
+  {
+    str[i] = bit_array_get(bitarr, i) ? '1' : '0';
+  }
+
+  str[bitarr->num_of_bits] = '\0';
+
+  return str;
+}
+
+char* bit_array_to_str_rev(const BIT_ARRAY* bitarr, char* str)
+{
+  bit_index_t i;
+
+  for(i = 0; i < bitarr->num_of_bits; i++)
+  {
+    str[i] = bit_array_get(bitarr, bitarr->num_of_bits-i-1) ? '1' : '0';
+  }
+
+  str[bitarr->num_of_bits] = '\0';
+
+  return str;
+}
+
+
+// Get a string representations for a given region, using given on/off characters.
+// Note: does not null-terminate
+void bit_array_to_substr(const BIT_ARRAY* bitarr,
+                         bit_index_t start, bit_index_t length,
+                         char* str, char on, char off,
+                         char left_to_right)
+{
+  assert(start + length <= bitarr->num_of_bits);
+
+  bit_index_t i, j;
+  bit_index_t end = start + length - 1;
+
+  for(i = 0; i < length; i++)
+  {
+    j = (left_to_right ? start + i : end - i);
+    str[i] = bit_array_get(bitarr, j) ? on : off;
+  }
+
+//  str[length] = '\0';
+}
+
+// Print this array to a file stream.  Prints '0's and '1'.  Doesn't print newline.
+void bit_array_print(const BIT_ARRAY* bitarr, FILE* fout)
+{
+  bit_index_t i;
+
+  for(i = 0; i < bitarr->num_of_bits; i++)
+  {
+    fprintf(fout, "%c", bit_array_get(bitarr, i) ? '1' : '0');
+  }
+}
+
+// Print a string representations for a given region, using given on/off characters.
+void bit_array_print_substr(const BIT_ARRAY* bitarr,
+                            bit_index_t start, bit_index_t length,
+                            FILE* fout, char on, char off,
+                            char left_to_right)
+{
+  assert(start + length <= bitarr->num_of_bits);
+
+  bit_index_t i, j;
+  bit_index_t end = start + length - 1;
+
+  for(i = 0; i < length; i++)
+  {
+    j = (left_to_right ? start + i : end - i);
+    fprintf(fout, "%c", bit_array_get(bitarr, j) ? on : off);
+  }
+}
+
+//
+// Decimal
+//
+
+// Get bit array as decimal str (e.g. 0b1101 -> "13")
+// len is the length of str char array -- will write at most len-1 chars
+// returns the number of characters needed
+// return is the same as strlen(str)
+size_t bit_array_to_decimal(const BIT_ARRAY *bitarr, char *str, size_t len)
+{
+  size_t i = 0;
+
+  if(bit_array_cmp_uint64(bitarr, 0) == 0)
+  {
+    if(len >= 2)
+    {
+      *str = '0';
+      *(str+1) = '\0';
+    }
+
+    return 1;
+  }
+
+  BIT_ARRAY *tmp = bit_array_clone(bitarr);
+  uint64_t rem;
+
+  str[len-1] = '\0';
+
+  while(bit_array_cmp_uint64(tmp, 0) != 0)
+  {
+    bit_array_div_uint64(tmp, 10, &rem);
+
+    if(i < len-1)
+    {
+      str[len-2-i] = '0' + rem;
+    }
+
+    i++;
+  }
+
+  if(i < len-1)
+  {
+    // Moves null-terminator as well
+    memmove(str, str+len-i-1, i+1);
+  }
+
+  bit_array_free(tmp);
+
+  return i;
+}
+
+// Get bit array from decimal str (e.g. "13" -> 0b1101)
+// Returns number of characters used
+size_t bit_array_from_decimal(BIT_ARRAY *bitarr, const char* decimal)
+{
+  bit_array_clear_all(bitarr);
+  size_t i = 0;
+
+  if(decimal[0] == '\0' || decimal[0] < '0' || decimal[0] > '9')
+  {
+    return 0;
+  }
+
+  bit_array_add_uint64(bitarr, decimal[i] - '0');
+  i++;
+
+  while(decimal[i] != '\0' && decimal[i] >= '0' && decimal[i] <= '9')
+  {
+    bit_array_mul_uint64(bitarr, 10);
+    bit_array_add_uint64(bitarr, decimal[i] - '0');
+    i++;
+  }
+
+  return i;
+}
+
+//
+// Hexidecimal
+//
+
+char bit_array_hex_to_nibble(char c, uint8_t *b)
+{
+  c = tolower(c);
+
+  if(c >= '0' && c <= '9')
+  {
+    *b = c - '0';
+    return 1;
+  }
+  else if(c >= 'a' && c <= 'f')
+  {
+    *b = 0xa + (c - 'a');
+    return 1;
+  }
+  else
+  {
+    return 0;
+  }
+}
+
+char bit_array_nibble_to_hex(uint8_t b, char uppercase)
+{
+  if(b <= 9)
+  {
+    return '0' + b;
+  }
+  else
+  {
+    return (uppercase ? 'A' : 'a') + (b - 0xa);
+  }
+}
+
+// Loads array from hex string
+// Returns the number of bits loaded (will be chars rounded up to multiple of 4)
+// (0 on failure)
+bit_index_t bit_array_from_hex(BIT_ARRAY* bitarr, bit_index_t offset,
+                               const char* str, size_t len)
+{
+  if(str[0] == '0' && tolower(str[1]) == 'x')
+  {
+    str += 2;
+    len -= 2;
+  }
+
+  size_t i;
+  for(i = 0; i < len; i++, offset += 4)
+  {
+    uint8_t b;
+    if(bit_array_hex_to_nibble(str[i], &b))
+    {
+      bit_array_ensure_size(bitarr, offset + 4);
+      _set_nibble(bitarr, offset, b);
+    }
+    else
+    {
+      break;
+    }
+  }
+
+  return 4 * i;
+}
+
+// Returns number of characters written
+size_t bit_array_to_hex(const BIT_ARRAY* bitarr,
+                        bit_index_t start, bit_index_t length,
+                        char* str, char uppercase)
+{
+  assert(start + length <= bitarr->num_of_bits);
+
+  size_t k = 0;
+  bit_index_t offset, end = start + length;
+
+  for(offset = start; offset + WORD_SIZE <= end; offset += WORD_SIZE)
+  {
+    word_t w = _get_word(bitarr, offset);
+
+    word_offset_t j;
+    for(j = 0; j < 64; j += 4)
+    {
+      str[k++] = bit_array_nibble_to_hex((w>>j) & 0xf, uppercase);
+    }
+  }
+
+  if(offset < end)
+  {
+    // Remaining full nibbles (4 bits)
+    word_t w = _get_word(bitarr, offset);
+
+    for(; offset + 4 <= end; offset += 4)
+    {
+      str[k++] = bit_array_nibble_to_hex(w & 0xf, uppercase);
+      w >>= 4;
+    }
+
+    if(offset < end)
+    {
+      // Remaining bits
+      str[k++] = bit_array_nibble_to_hex(w & bitmask64(end - offset), uppercase);
+    }
+  }
+
+  str[k] = '\0';
+
+  // Return number of characters written
+  return k;
+}
+
+// Print bit array as hex
+size_t bit_array_print_hex(const BIT_ARRAY* bitarr,
+                           bit_index_t start, bit_index_t length,
+                           FILE* fout, char uppercase)
+{
+  assert(start + length <= bitarr->num_of_bits);
+
+  size_t k = 0;
+  bit_index_t offset, end = start + length;
+
+  for(offset = start; offset + WORD_SIZE <= end; offset += WORD_SIZE)
+  {
+    word_t w = _get_word(bitarr, offset);
+
+    word_offset_t j;
+    for(j = 0; j < 64; j += 4)
+    {
+      fprintf(fout, "%c", bit_array_nibble_to_hex((w>>j) & 0xf, uppercase));
+      k++;
+    }
+  }
+
+  if(offset < end)
+  {
+    // Remaining full nibbles (4 bits)
+    word_t w = _get_word(bitarr, offset);
+
+    for(; offset + 4 <= end; offset += 4)
+    {
+      fprintf(fout, "%c", bit_array_nibble_to_hex(w & 0xf, uppercase));
+      w >>= 4;
+      k++;
+    }
+
+    if(offset < end)
+    {
+      // Remaining bits
+      char hex = bit_array_nibble_to_hex(w & bitmask64(end - offset), uppercase);
+      fprintf(fout, "%c", hex);
+      k++;
+    }
+  }
+
+  return k;
+}
+
+//
+// Clone and copy
+//
+
+// Returns NULL if cannot malloc
+BIT_ARRAY* bit_array_clone(const BIT_ARRAY* bitarr)
+{
+  BIT_ARRAY* cpy = bit_array_create(bitarr->num_of_bits);
+
+  if(cpy == NULL)
+  {
+    return NULL;
+  }
+
+  // Copy across bits
+  memcpy(cpy->words, bitarr->words, bitarr->num_of_words * sizeof(word_t));
+
+  DEBUG_VALIDATE(cpy);
+  return cpy;
+}
+
+// destination and source may be the same bit_array
+// and src/dst regions may overlap
+static void _array_copy(BIT_ARRAY* dst, bit_index_t dstindx,
+                        const BIT_ARRAY* src, bit_index_t srcindx,
+                        bit_index_t length)
+{
+  DEBUG_PRINT("bit_array_copy(dst: %zu, src: %zu, length: %zu)\n",
+              (size_t)dstindx, (size_t)srcindx, (size_t)length);
+
+  // Num of full words to copy
+  word_addr_t num_of_full_words = length / WORD_SIZE;
+  word_addr_t i;
+
+  word_offset_t bits_in_last_word = bits_in_top_word(length);
+
+  if(dst == src && srcindx > dstindx)
+  {
+    // Work left to right
+    DEBUG_PRINT("work left to right\n");
+
+    for(i = 0; i < num_of_full_words; i++)
+    {
+      word_t word = _get_word(src, srcindx+i*WORD_SIZE);
+      _set_word(dst, dstindx+i*WORD_SIZE, word);
+    }
+
+    if(bits_in_last_word > 0)
+    {
+      word_t src_word = _get_word(src, srcindx+i*WORD_SIZE);
+      word_t dst_word = _get_word(dst, dstindx+i*WORD_SIZE);
+
+      word_t mask = bitmask64(bits_in_last_word);
+      word_t word = bitmask_merge(src_word, dst_word, mask);
+
+      _set_word(dst, dstindx+num_of_full_words*WORD_SIZE, word);
+    }
+  }
+  else
+  {
+    // Work right to left
+    DEBUG_PRINT("work right to left\n");
+
+    for(i = 0; i < num_of_full_words; i++)
+    {
+      word_t word = _get_word(src, srcindx+length-(i+1)*WORD_SIZE);
+      _set_word(dst, dstindx+length-(i+1)*WORD_SIZE, word);
+    }
+
+    DEBUG_PRINT("Copy %i,%i to %i\n", (int)srcindx, (int)bits_in_last_word,
+                                      (int)dstindx);
+
+    if(bits_in_last_word > 0)
+    {
+      word_t src_word = _get_word(src, srcindx);
+      word_t dst_word = _get_word(dst, dstindx);
+
+      word_t mask = bitmask64(bits_in_last_word);
+      word_t word = bitmask_merge(src_word, dst_word, mask);
+      _set_word(dst, dstindx, word);
+    }
+  }
+
+  _mask_top_word(dst);
+}
+
+// destination and source may be the same bit_array
+// and src/dst regions may overlap
+void bit_array_copy(BIT_ARRAY* dst, bit_index_t dstindx,
+                    const BIT_ARRAY* src, bit_index_t srcindx,
+                    bit_index_t length)
+{
+  assert(srcindx + length <= src->num_of_bits);
+  assert(dstindx <= dst->num_of_bits);
+  _array_copy(dst, dstindx, src, srcindx, length);
+  DEBUG_VALIDATE(dst);
+}
+
+// Clone `src` into `dst`. Resizes `dst`.
+void bit_array_copy_all(BIT_ARRAY* dst, const BIT_ARRAY* src)
+{
+  bit_array_resize_critical(dst, src->num_of_bits);
+  memmove(dst->words, src->words, src->num_of_words * sizeof(word_t));
+  DEBUG_VALIDATE(dst);
+}
+
+
+//
+// Logic operators
+//
+
+// Destination can be the same as one or both of the sources
+void bit_array_and(BIT_ARRAY* dst, const BIT_ARRAY* src1, const BIT_ARRAY* src2)
+{
+  // Ensure dst array is big enough
+  word_addr_t max_bits = MAX(src1->num_of_bits, src2->num_of_bits);
+  bit_array_ensure_size_critical(dst, max_bits);
+
+  word_addr_t min_words = MIN(src1->num_of_words, src2->num_of_words);
+
+  word_addr_t i;
+
+  for(i = 0; i < min_words; i++)
+  {
+    dst->words[i] = src1->words[i] & src2->words[i];
+  }
+
+  // Set remaining bits to zero
+  for(i = min_words; i < dst->num_of_words; i++)
+  {
+    dst->words[i] = (word_t)0;
+  }
+
+  DEBUG_VALIDATE(dst);
+}
+
+// Destination can be the same as one or both of the sources
+static void _logical_or_xor(BIT_ARRAY* dst,
+                            const BIT_ARRAY* src1,
+                            const BIT_ARRAY* src2,
+                            char use_xor)
+{
+  // Ensure dst array is big enough
+  bit_array_ensure_size_critical(dst, MAX(src1->num_of_bits, src2->num_of_bits));
+
+  word_addr_t min_words = MIN(src1->num_of_words, src2->num_of_words);
+  word_addr_t max_words = MAX(src1->num_of_words, src2->num_of_words);
+
+  word_addr_t i;
+
+  if(use_xor)
+  {
+    for(i = 0; i < min_words; i++)
+      dst->words[i] = src1->words[i] ^ src2->words[i];
+  }
+  else
+  {
+    for(i = 0; i < min_words; i++)
+      dst->words[i] = src1->words[i] | src2->words[i];
+  }
+
+  // Copy remaining bits from longer src array
+  if(min_words != max_words)
+  {
+    const BIT_ARRAY* longer = src1->num_of_words > src2->num_of_words ? src1 : src2;
+
+    for(i = min_words; i < max_words; i++)
+    {
+      dst->words[i] = longer->words[i];
+    }
+  }
+
+  // Set remaining bits to zero
+  size_t size = (dst->num_of_words - max_words) * sizeof(word_t);
+  memset(dst->words + max_words, 0, size);
+
+  DEBUG_VALIDATE(dst);
+}
+
+void bit_array_or(BIT_ARRAY* dst, const BIT_ARRAY* src1, const BIT_ARRAY* src2)
+{
+  _logical_or_xor(dst, src1, src2, 0);
+}
+
+// Destination can be the same as one or both of the sources
+void bit_array_xor(BIT_ARRAY* dst, const BIT_ARRAY* src1, const BIT_ARRAY* src2)
+{
+  _logical_or_xor(dst, src1, src2, 1);
+}
+
+// If dst is longer than src, top bits are set to 1
+void bit_array_not(BIT_ARRAY* dst, const BIT_ARRAY* src)
+{
+  bit_array_ensure_size_critical(dst, src->num_of_bits);
+
+  word_addr_t i;
+
+  for(i = 0; i < src->num_of_words; i++)
+  {
+    dst->words[i] = ~(src->words[i]);
+  }
+
+  // Set remaining words to 1s
+  for(i = src->num_of_words; i < dst->num_of_words; i++)
+  {
+    dst->words[i] = WORD_MAX;
+  }
+
+  _mask_top_word(dst);
+
+  DEBUG_VALIDATE(dst);
+}
+
+//
+// Comparisons
+//
+
+// Compare two bit arrays by value stored, with index 0 being the Least
+// Significant Bit (LSB). Arrays do not have to be the same length.
+// Example: ..0101 (5) > ...0011 (3) [index 0 is LSB at right hand side]
+// Sorts on length if all zeros: (0,0) < (0,0,0)
+// returns:
+//  >0 iff bitarr1 > bitarr2
+//   0 iff bitarr1 == bitarr2
+//  <0 iff bitarr1 < bitarr2
+int bit_array_cmp(const BIT_ARRAY* bitarr1, const BIT_ARRAY* bitarr2)
+{
+  word_addr_t i;
+  word_t word1, word2;
+  word_addr_t min_words = bitarr1->num_of_words;
+
+  // i is unsigned so break when i == 0
+  if(bitarr1->num_of_words > bitarr2->num_of_words) {
+    min_words = bitarr2->num_of_words;
+    for(i = bitarr1->num_of_words-1; ; i--) {
+      if(bitarr1->words[i]) return 1;
+      if(i == bitarr2->num_of_words) break;
+    }
+  }
+  else if(bitarr1->num_of_words < bitarr2->num_of_words) {
+    for(i = bitarr2->num_of_words-1; ; i--) {
+      if(bitarr2->words[i]) return 1;
+      if(i == bitarr1->num_of_words) break;
+    }
+  }
+
+  if(min_words == 0) return 0;
+
+  for(i = min_words-1; ; i--)
+  {
+    word1 = bitarr1->words[i];
+    word2 = bitarr2->words[i];
+    if(word1 != word2) return (word1 > word2 ? 1 : -1);
+    if(i == 0) break;
+  }
+
+  if(bitarr1->num_of_bits == bitarr2->num_of_bits) return 0;
+  return bitarr1->num_of_bits > bitarr2->num_of_bits ? 1 : -1;
+}
+
+// Compare two bit arrays by value stored, with index 0 being the Most
+// Significant Bit (MSB). Arrays do not have to be the same length.
+// Example: 10.. > 01.. [index 0 is MSB at left hand side]
+// Sorts on length if all zeros: (0,0) < (0,0,0)
+// returns:
+//  >0 iff bitarr1 > bitarr2
+//   0 iff bitarr1 == bitarr2
+//  <0 iff bitarr1 < bitarr2
+int bit_array_cmp_big_endian(const BIT_ARRAY* bitarr1, const BIT_ARRAY* bitarr2)
+{
+  word_addr_t min_words = MAX(bitarr1->num_of_words, bitarr2->num_of_words);
+
+  word_addr_t i;
+  word_t word1, word2;
+
+  for(i = 0; i < min_words; i++) {
+    word1 = _reverse_word(bitarr1->words[i]);
+    word2 = _reverse_word(bitarr2->words[i]);
+    if(word1 != word2) return (word1 > word2 ? 1 : -1);
+  }
+
+  // Check remaining words. Only one of these loops will execute
+  for(; i < bitarr1->num_of_words; i++)
+    if(bitarr1->words[i]) return 1;
+  for(; i < bitarr2->num_of_words; i++)
+    if(bitarr2->words[i]) return -1;
+
+  if(bitarr1->num_of_bits == bitarr2->num_of_bits) return 0;
+  return bitarr1->num_of_bits > bitarr2->num_of_bits ? 1 : -1;
+}
+
+// compare bitarr with (bitarr2 << pos)
+// bit_array_cmp(bitarr1, bitarr2<<pos)
+// returns:
+//  >0 iff bitarr1 > bitarr2
+//   0 iff bitarr1 == bitarr2
+//  <0 iff bitarr1 < bitarr2
+int bit_array_cmp_words(const BIT_ARRAY *arr1,
+                        bit_index_t pos, const BIT_ARRAY *arr2)
+{
+  if(arr1->num_of_bits == 0 && arr2->num_of_bits == 0)
+  {
+    return 0;
+  }
+
+  bit_index_t top_bit1 = 0, top_bit2 = 0;
+
+  char arr1_zero = !bit_array_find_last_set_bit(arr1, &top_bit1);
+  char arr2_zero = !bit_array_find_last_set_bit(arr2, &top_bit2);
+
+  if(arr1_zero && arr2_zero) return 0;
+  if(arr1_zero) return -1;
+  if(arr2_zero) return 1;
+
+  bit_index_t top_bit2_offset = top_bit2 + pos;
+
+  if(top_bit1 != top_bit2_offset) {
+    return top_bit1 > top_bit2_offset ? 1 : -1;
+  }
+
+  word_addr_t i;
+  word_t word1, word2;
+
+  for(i = top_bit2 / WORD_SIZE; i > 0; i--)
+  {
+    word1 = _get_word(arr1, pos + i * WORD_SIZE);
+    word2 = arr2->words[i];
+
+    if(word1 > word2) return 1;
+    if(word1 < word2) return -1;
+  }
+
+  word1 = _get_word(arr1, pos);
+  word2 = arr2->words[0];
+
+  if(word1 > word2) return 1;
+  if(word1 < word2) return -1;
+
+  // return 1 if arr1[0..pos] != 0, 0 otherwise
+
+  // Whole words
+  word_addr_t num_words = pos / WORD_SIZE;
+
+  for(i = 0; i < num_words; i++)
+  {
+    if(arr1->words[i] > 0)
+    {
+      return 1;
+    }
+  }
+
+  word_offset_t bits_remaining = pos - num_words * WORD_SIZE;
+
+  if(arr1->words[num_words] & bitmask64(bits_remaining))
+  {
+    return 1;
+  }
+
+  return 0;
+}
+
+
+//
+// Reverse -- coords may wrap around
+//
+
+// No bounds checking
+// length cannot be zero
+static void _reverse_region(BIT_ARRAY* bitarr,
+                            bit_index_t start,
+                            bit_index_t length)
+{
+  bit_index_t left = start;
+  bit_index_t right = (start + length - WORD_SIZE) % bitarr->num_of_bits;
+
+  while(length >= 2 * WORD_SIZE)
+  {
+    // Swap entire words
+    word_t left_word = _get_word_cyclic(bitarr, left);
+    word_t right_word = _get_word_cyclic(bitarr, right);
+
+    // reverse words individually
+    left_word = _reverse_word(left_word);
+    right_word = _reverse_word(right_word);
+
+    // Swap
+    _set_word_cyclic(bitarr, left, right_word);
+    _set_word_cyclic(bitarr, right, left_word);
+
+    // Update
+    left = (left + WORD_SIZE) % bitarr->num_of_bits;
+    right = (right < WORD_SIZE ? right + bitarr->num_of_bits : right) - WORD_SIZE;
+    length -= 2 * WORD_SIZE;
+  }
+
+  word_t word, rev;
+
+  if(length == 0)
+  {
+    return;
+  }
+  else if(length > WORD_SIZE)
+  {
+    // Words overlap
+    word_t left_word = _get_word_cyclic(bitarr, left);
+    word_t right_word = _get_word_cyclic(bitarr, right);
+
+    rev = _reverse_word(left_word);
+    right_word = _reverse_word(right_word);
+
+    // fill left 64 bits with right word rev
+    _set_word_cyclic(bitarr, left, right_word);
+
+    // Now do remaining bits (length is between 1 and 64 bits)
+    left += WORD_SIZE;
+    length -= WORD_SIZE;
+
+    word = _get_word_cyclic(bitarr, left);
+  }
+  else
+  {
+    word = _get_word_cyclic(bitarr, left);
+    rev = _reverse_word(word);
+  }
+
+  rev >>= WORD_SIZE - length;
+  word_t mask = bitmask64(length);
+
+  word = bitmask_merge(rev, word, mask);
+
+  _set_word_cyclic(bitarr, left, word);
+}
+
+void bit_array_reverse_region(BIT_ARRAY* bitarr, bit_index_t start, bit_index_t len)
+{
+  assert(start + len <= bitarr->num_of_bits);
+  if(len > 0) _reverse_region(bitarr, start, len);
+  DEBUG_VALIDATE(bitarr);
+}
+
+void bit_array_reverse(BIT_ARRAY* bitarr)
+{
+  if(bitarr->num_of_bits > 0) _reverse_region(bitarr, 0, bitarr->num_of_bits);
+  DEBUG_VALIDATE(bitarr);
+}
+
+//
+// Shift left / right
+//
+
+// Shift towards MSB / higher index
+void bit_array_shift_left(BIT_ARRAY* bitarr, bit_index_t shift_dist, char fill)
+{
+  if(shift_dist >= bitarr->num_of_bits)
+  {
+    fill ? bit_array_set_all(bitarr) : bit_array_clear_all(bitarr);
+    return;
+  }
+  else if(shift_dist == 0)
+  {
+    return;
+  }
+
+  FillAction action = fill ? FILL_REGION : ZERO_REGION;
+
+  bit_index_t cpy_length = bitarr->num_of_bits - shift_dist;
+  _array_copy(bitarr, shift_dist, bitarr, 0, cpy_length);
+  _set_region(bitarr, 0, shift_dist, action);
+}
+
+// shift left extend - don't truncate bits when shifting UP, instead
+// make room for them.
+void bit_array_shift_left_extend(BIT_ARRAY* bitarr, bit_index_t shift_dist,
+                                 char fill)
+{
+   bit_index_t newlen = bitarr->num_of_bits + shift_dist;
+   bit_index_t cpy_length = bitarr->num_of_bits;
+
+  if(shift_dist == 0)
+  {
+    return;
+  }
+
+  bit_array_resize_critical(bitarr, newlen);
+
+  FillAction action = fill ? FILL_REGION : ZERO_REGION;
+  _array_copy(bitarr, shift_dist, bitarr, 0, cpy_length);
+  _set_region(bitarr, 0, shift_dist, action);
+}
+
+// Shift towards LSB / lower index
+void bit_array_shift_right(BIT_ARRAY* bitarr, bit_index_t shift_dist, char fill)
+{
+  if(shift_dist >= bitarr->num_of_bits)
+  {
+    fill ? bit_array_set_all(bitarr) : bit_array_clear_all(bitarr);
+    return;
+  }
+  else if(shift_dist == 0)
+  {
+    return;
+  }
+
+  FillAction action = fill ? FILL_REGION : ZERO_REGION;
+
+  bit_index_t cpy_length = bitarr->num_of_bits - shift_dist;
+  bit_array_copy(bitarr, 0, bitarr, shift_dist, cpy_length);
+
+  _set_region(bitarr, cpy_length, shift_dist, action);
+}
+
+//
+// Cycle
+//
+
+// Cycle towards index 0
+void bit_array_cycle_right(BIT_ARRAY* bitarr, bit_index_t cycle_dist)
+{
+  if(bitarr->num_of_bits == 0)
+  {
+    return;
+  }
+
+  cycle_dist = cycle_dist % bitarr->num_of_bits;
+
+  if(cycle_dist == 0)
+  {
+    return;
+  }
+
+  bit_index_t len1 = cycle_dist;
+  bit_index_t len2 = bitarr->num_of_bits - cycle_dist;
+
+  _reverse_region(bitarr, 0, len1);
+  _reverse_region(bitarr, len1, len2);
+  bit_array_reverse(bitarr);
+}
+
+// Cycle away from index 0
+void bit_array_cycle_left(BIT_ARRAY* bitarr, bit_index_t cycle_dist)
+{
+  if(bitarr->num_of_bits == 0)
+  {
+    return;
+  }
+
+  cycle_dist = cycle_dist % bitarr->num_of_bits;
+
+  if(cycle_dist == 0)
+  {
+    return;
+  }
+
+  bit_index_t len1 = bitarr->num_of_bits - cycle_dist;
+  bit_index_t len2 = cycle_dist;
+
+  _reverse_region(bitarr, 0, len1);
+  _reverse_region(bitarr, len1, len2);
+  bit_array_reverse(bitarr);
+}
+
+//
+// Next permutation
+//
+
+static word_t _next_permutation(word_t v)
+{
+  // From http://graphics.stanford.edu/~seander/bithacks.html#NextBitPermutation
+  word_t t = v | (v - 1); // t gets v's least significant 0 bits set to 1
+  // Next set to 1 the most significant bit to change,
+  // set to 0 the least significant ones, and add the necessary 1 bits.
+  return (t+1) | (((~t & (t+1)) - 1) >> (trailing_zeros(v) + 1));
+}
+
+// Get the next permutation of an array with a fixed size and given number of
+// bits set.  Also known as next lexicographic permutation.
+// Given a bit array find the next lexicographic orginisation of the bits
+// Number of possible combinations given by (size choose bits_set) i.e. nCk
+// 00011 -> 00101 -> 00110 -> 01001 -> 01010 ->
+// 01100 -> 10001 -> 10010 -> 10100 -> 11000 -> 00011 (back to start)
+void bit_array_next_permutation(BIT_ARRAY* bitarr)
+{
+  if(bitarr->num_of_bits == 0)
+  {
+    return;
+  }
+
+  word_addr_t w;
+
+  char carry = 0;
+  word_offset_t top_bits = bitset64_idx(bitarr->num_of_bits);
+
+  for(w = 0; w < bitarr->num_of_words; w++)
+  {
+    word_t mask
+      = (w < bitarr->num_of_words - 1 || top_bits == 0) ? WORD_MAX
+                                                        : bitmask64(top_bits);
+
+    if(bitarr->words[w] > 0 &&
+       (bitarr->words[w] | (bitarr->words[w]-1)) == mask)
+    {
+      // Bits in this word cannot be moved forward
+      carry = 1;
+    }
+    else if(carry)
+    {
+      // 0111 -> 1000, 1000 -> 1001
+      word_t tmp = bitarr->words[w] + 1;
+
+      // Count bits previously set
+      bit_index_t bits_previously_set = POPCOUNT(bitarr->words[w]);
+
+      // set new word
+      bitarr->words[w] = tmp;
+
+      // note: w is unsigned
+      // Zero words while counting bits set
+      while(w > 0)
+      {
+        bits_previously_set += POPCOUNT(bitarr->words[w-1]);
+        bitarr->words[w-1] = 0;
+        w--;
+      }
+
+      // Set bits at the beginning
+      SET_REGION(bitarr, 0, bits_previously_set - POPCOUNT(tmp));
+
+      carry = 0;
+      break;
+    }
+    else if(bitarr->words[w] > 0)
+    {
+      bitarr->words[w] = _next_permutation(bitarr->words[w]);
+      break;
+    }
+  }
+
+  if(carry)
+  {
+    // Loop around
+    bit_index_t num_bits_set = bit_array_num_bits_set(bitarr);
+    bit_array_clear_all(bitarr);
+    SET_REGION(bitarr, 0, num_bits_set);
+  }
+
+  DEBUG_VALIDATE(bitarr);
+}
+
+
+//
+// Interleave
+//
+
+// dst cannot point to the same bit array as src1 or src2
+// src1, src2 may point to the same bit array
+// abcd 1234 -> a1b2c3d4
+// 0011 0000 -> 00001010
+// 1111 0000 -> 10101010
+// 0101 1010 -> 01100110
+void bit_array_interleave(BIT_ARRAY* dst,
+                          const BIT_ARRAY* src1,
+                          const BIT_ARRAY* src2)
+{
+  // dst cannot be either src1 or src2
+  assert(dst != src1 && dst != src2);
+  // Behaviour undefined when src1 length != src2 length",
+  assert(src1->num_of_bits == src2->num_of_bits);
+
+  // Need at least src1->num_of_words + src2->num_of_words
+  size_t nwords = MIN(src1->num_of_words + src2->num_of_words, 2);
+  _bit_array_ensure_nwords(dst, nwords, __FILE__, __LINE__, __func__);
+  dst->num_of_bits = src1->num_of_bits + src2->num_of_bits;
+  dst->num_of_words = roundup_bits2words64(dst->num_of_bits);
+
+  word_addr_t i, j;
+
+  for(i = 0, j = 0; i < src1->num_of_words; i++)
+  {
+    word_t a = src1->words[i];
+    word_t b = src2->words[i];
+
+    dst->words[j++] =  morton_table0[(a      ) & 0xff] |
+                       morton_table1[(b      ) & 0xff] |
+                      (morton_table0[(a >>  8) & 0xff] << 16) |
+                      (morton_table1[(b >>  8) & 0xff] << 16) |
+                      (morton_table0[(a >> 16) & 0xff] << 32) |
+                      (morton_table1[(b >> 16) & 0xff] << 32) |
+                      (morton_table0[(a >> 24) & 0xff] << 48) |
+                      (morton_table1[(b >> 24) & 0xff] << 48);
+
+    dst->words[j++] =  morton_table0[(a >> 32) & 0xff] |
+                       morton_table1[(b >> 32) & 0xff] |
+                      (morton_table0[(a >> 40) & 0xff] << 16) |
+                      (morton_table1[(b >> 40) & 0xff] << 16) |
+                      (morton_table0[(a >> 48) & 0xff] << 32) |
+                      (morton_table1[(b >> 48) & 0xff] << 32) |
+                      (morton_table0[(a >> 56)       ] << 48) |
+                      (morton_table1[(b >> 56)       ] << 48);
+  }
+
+  DEBUG_VALIDATE(dst);
+}
+
+//
+// Random
+//
+
+// Set bits randomly with probability prob : 0 <= prob <= 1
+void bit_array_random(BIT_ARRAY* bitarr, float prob)
+{
+  assert(prob >= 0 && prob <= 1);
+
+  if(bitarr->num_of_bits == 0)
+  {
+    return;
+  }
+  else if(prob == 1)
+  {
+    bit_array_set_all(bitarr);
+    return;
+  }
+
+  // rand() generates number between 0 and RAND_MAX inclusive
+  // therefore we want to check if rand() <= p
+  long p = RAND_MAX * prob;
+
+  _seed_rand();
+
+  word_addr_t w;
+  word_offset_t o;
+
+  // Initialise to zero
+  memset(bitarr->words, 0, bitarr->num_of_words * sizeof(word_t));
+
+  for(w = 0; w < bitarr->num_of_words - 1; w++)
+  {
+    for(o = 0; o < WORD_SIZE; o++)
+    {
+      if(rand() <= p)
+      {
+        bitarr->words[w] |= ((word_t)0x1 << o);
+      }
+    }
+  }
+
+  // Top word
+  word_offset_t bits_in_last_word = bits_in_top_word(bitarr->num_of_bits);
+  w = bitarr->num_of_words - 1;
+
+  for(o = 0; o < bits_in_last_word; o++)
+  {
+    if(rand() <= p)
+    {
+      bitarr->words[w] |= ((word_t)0x1 << o);
+    }
+  }
+
+  DEBUG_VALIDATE(bitarr);
+}
+
+// Shuffle the bits in an array randomly
+void bit_array_shuffle(BIT_ARRAY* bitarr)
+{
+  if(bitarr->num_of_bits == 0)
+    return;
+
+  _seed_rand();
+
+  bit_index_t i, j;
+
+  for(i = bitarr->num_of_bits - 1; i > 0; i--)
+  {
+    j = (bit_index_t)rand() % i;
+
+    // Swap i and j
+    char x = (bitarr->words[bitset64_wrd(i)] >> bitset64_idx(i)) & 0x1;
+    char y = (bitarr->words[bitset64_wrd(j)] >> bitset64_idx(j)) & 0x1;
+
+    if(!y)
+      bitarr->words[bitset64_wrd(i)] &= ~((word_t)0x1 << bitset64_idx(i));
+    else
+      bitarr->words[bitset64_wrd(i)] |= (word_t)0x1 << bitset64_idx(i);
+
+    if(!x)
+      bitarr->words[bitset64_wrd(j)] &= ~((word_t)0x1 << bitset64_idx(j));
+    else
+      bitarr->words[bitset64_wrd(j)] |= (word_t)0x1 << bitset64_idx(j);
+  }
+
+  DEBUG_VALIDATE(bitarr);
+}
+
+//
+// Arithmetic
+//
+
+// Returns 1 on sucess, 0 if value in array is too big
+char bit_array_as_num(const BIT_ARRAY* bitarr, uint64_t* result)
+{
+  if(bitarr->num_of_bits == 0)
+  {
+    *result = 0;
+    return 1;
+  }
+
+  word_addr_t w;
+
+  for(w = bitarr->num_of_words-1; w > 0; w--)
+  {
+    if(bitarr->words[w] > 0)
+    {
+      return 0;
+    }
+  }
+
+  *result = bitarr->words[0];
+  return 1;
+}
+
+
+// 1 iff bitarr > value
+// 0 iff bitarr == value
+// -1 iff bitarr < value
+int bit_array_cmp_uint64(const BIT_ARRAY* bitarr, uint64_t value)
+{
+  uint64_t arr_num = 0;
+
+  // If cannot put bitarr in uint64, it is > value
+  if(!bit_array_as_num(bitarr, &arr_num)) return 1;
+
+  if(arr_num > value)      return  1;
+  else if(arr_num < value) return -1;
+  else                     return  0;
+}
+
+// If value is zero, no change is made
+void bit_array_add_uint64(BIT_ARRAY* bitarr, uint64_t value)
+{
+  if(value == 0)
+  {
+    return;
+  }
+  else if(bitarr->num_of_bits == 0)
+  {
+    bit_array_resize_critical(bitarr, WORD_SIZE - leading_zeros(value));
+    bitarr->words[0] = (word_t)value;
+    return;
+  }
+
+  char carry = 0;
+  word_addr_t i;
+
+  for(i = 0; i < bitarr->num_of_words; i++)
+  {
+    if(WORD_MAX - bitarr->words[i] < value)
+    {
+      carry = 1;
+      bitarr->words[i] += value;
+    }
+    else
+    {
+      // Carry is absorbed
+      bitarr->words[i] += value;
+      carry = 0;
+      break;
+    }
+  }
+
+  if(carry)
+  {
+    // Bit array full, need another bit after all words filled
+    bit_array_resize_critical(bitarr, bitarr->num_of_words * WORD_SIZE + 1);
+
+    // Set top word to 1
+    bitarr->words[bitarr->num_of_words-1] = 1;
+  }
+  else
+  {
+    word_t final_word = bitarr->words[bitarr->num_of_words-1];
+    word_offset_t expected_bits = bits_in_top_word(bitarr->num_of_bits);
+    word_offset_t actual_bits = WORD_SIZE - leading_zeros(final_word);
+
+    if(actual_bits > expected_bits)
+    {
+      // num_of_bits has increased -- num_of_words has not
+      bitarr->num_of_bits += (actual_bits - expected_bits);
+    }
+  }
+}
+
+// If value is greater than bitarr, bitarr is not changed and 0 is returned
+// Returns 1 on success, 0 if value > bitarr
+char bit_array_sub_uint64(BIT_ARRAY* bitarr, uint64_t value)
+{
+  if(value == 0)
+  {
+    return 1;
+  }
+  else if(bitarr->words[0] >= value)
+  {
+    bitarr->words[0] -= value;
+    return 1;
+  }
+
+  value -= bitarr->words[0];
+
+  word_addr_t i;
+
+  for(i = 1; i < bitarr->num_of_words; i++)
+  {
+    if(bitarr->words[i] > 0)
+    {
+      // deduct one
+      bitarr->words[i]--;
+
+      for(; i > 0; i--)
+      {
+        bitarr->words[i] = WORD_MAX;
+      }
+
+      // -1 since we've already deducted 1
+      bitarr->words[0] = WORD_MAX - value - 1;
+
+      return 1;
+    }
+  }
+
+  // subtract value is greater than array
+  return 0;
+}
+
+//
+// Arithmetic between bit arrays
+//
+
+// src1, src2 and dst can all be the same BIT_ARRAY
+static void _arithmetic(BIT_ARRAY* dst,
+                        const BIT_ARRAY* src1,
+                        const BIT_ARRAY* src2,
+                        char subtract)
+{
+  word_addr_t max_words = MAX(src1->num_of_words, src2->num_of_words);
+
+  // Adding: dst_words >= max(src1 words, src2 words)
+  // Subtracting: dst_words is >= src1->num_of_words
+
+  char carry = subtract ? 1 : 0;
+
+  word_addr_t i;
+  word_t word1, word2;
+
+  for(i = 0; i < max_words; i++)
+  {
+    word1 = (i < src1->num_of_words ? src1->words[i] : 0);
+    word2 = (i < src2->num_of_words ? src2->words[i] : 0);
+
+    if(subtract)
+      word2 = ~word2;
+
+    dst->words[i] = word1 + word2 + carry;
+    // Update carry
+    carry = WORD_MAX - word1 < word2 || WORD_MAX - word1 - word2 < (word_t)carry;
+  }
+
+  if(subtract)
+  {
+    carry = 0;
+  }
+  else
+  {
+    // Check last word
+    word_offset_t bits_on_last_word = bits_in_top_word(dst->num_of_bits);
+
+    if(bits_on_last_word < WORD_SIZE)
+    {
+      word_t mask = bitmask64(bits_on_last_word);
+
+      if(dst->words[max_words-1] > mask)
+      {
+        // Array has overflowed, increase size
+        dst->num_of_bits++;
+      }
+    }
+    else if(carry)
+    {
+      // Carry onto a new word
+      if(dst->num_of_words == max_words)
+      {
+        // Need to resize for the carry bit
+        bit_array_resize_critical(dst, dst->num_of_bits+1);
+      }
+
+      dst->words[max_words] = (word_t)1;
+    }
+  }
+
+  // Zero the rest of dst array
+  for(i = max_words+carry; i < dst->num_of_words; i++)
+  {
+    dst->words[i] = (word_t)0;
+  }
+
+  DEBUG_VALIDATE(dst);
+}
+
+// src1, src2 and dst can all be the same BIT_ARRAY
+// If dst is shorter than either of src1, src2, it is enlarged
+void bit_array_add(BIT_ARRAY* dst, const BIT_ARRAY* src1, const BIT_ARRAY* src2)
+{
+  bit_array_ensure_size_critical(dst, MAX(src1->num_of_bits, src2->num_of_bits));
+  _arithmetic(dst, src1, src2, 0);
+}
+
+// dst = src1 - src2
+// src1, src2 and dst can all be the same BIT_ARRAY
+// If dst is shorter than src1, it will be extended to be as long as src1
+// src1 must be greater than or equal to src2 (src1 >= src2)
+void bit_array_subtract(BIT_ARRAY* dst,
+                          const BIT_ARRAY* src1, const BIT_ARRAY* src2)
+{
+  // subtraction by method of complements:
+  // a - b = a + ~b + 1 = src1 + ~src2 +1
+
+  assert(bit_array_cmp(src1, src2) >= 0); // Require src1 >= src2
+
+  bit_array_ensure_size_critical(dst, src1->num_of_bits);
+  _arithmetic(dst, src1, src2, 1);
+}
+
+
+// Add `add` to `bitarr` at `pos`
+// Bounds checking not needed as out of bounds is valid
+void bit_array_add_word(BIT_ARRAY *bitarr, bit_index_t pos, uint64_t add)
+{
+  DEBUG_VALIDATE(bitarr);
+
+  if(add == 0)
+  {
+    return;
+  }
+  else if(pos >= bitarr->num_of_bits)
+  {
+    // Resize and add!
+    bit_index_t num_bits_required = pos + (WORD_SIZE - leading_zeros(add));
+    bit_array_resize_critical(bitarr, num_bits_required);
+    _set_word(bitarr, pos, (word_t)add);
+    return;
+  }
+
+  /*
+  char str[1000];
+  printf(" add_word: %s\n", bit_array_to_str_rev(bitarr, str));
+  printf("     word: %s [pos: %i]\n", _word_to_str(add, str), (int)pos);
+  */
+
+  word_t w = _get_word(bitarr, pos);
+  word_t sum = w + add;
+  char carry = WORD_MAX - w < add;
+
+  // Ensure array is big enough
+  bit_index_t num_bits_required = pos + (carry ? WORD_SIZE + 1
+                                               : (WORD_SIZE - leading_zeros(sum)));
+
+  bit_array_ensure_size(bitarr, num_bits_required);
+
+  _set_word(bitarr, pos, sum);
+  pos += WORD_SIZE;
+
+  if(carry)
+  {
+    word_offset_t offset = pos % WORD_SIZE;
+    word_addr_t addr = bitset64_wrd(pos);
+
+    add = (word_t)0x1 << offset;
+    carry = (WORD_MAX - bitarr->words[addr] < add);
+    sum = bitarr->words[addr] + add;
+
+    num_bits_required = addr * WORD_SIZE +
+                        (carry ? WORD_SIZE + 1 : (WORD_SIZE - leading_zeros(sum)));
+
+    bit_array_ensure_size(bitarr, num_bits_required);
+
+    bitarr->words[addr++] = sum;
+
+    if(carry)
+    {
+      while(addr < bitarr->num_of_words && bitarr->words[addr] == WORD_MAX)
+      {
+        bitarr->words[addr++] = 0;
+      }
+
+      if(addr == bitarr->num_of_words)
+      {
+        bit_array_resize_critical(bitarr, addr * WORD_SIZE + 1);
+      }
+      else if(addr == bitarr->num_of_words-1 &&
+              bitarr->words[addr] == bitmask64(bits_in_top_word(bitarr->num_of_bits)))
+      {
+        bit_array_resize_critical(bitarr, bitarr->num_of_bits + 1);
+      }
+
+      bitarr->words[addr]++;
+    }
+  }
+
+  DEBUG_VALIDATE(bitarr);
+}
+
+// Add `add` to `bitarr` at `pos`
+// Bounds checking not needed as out of bounds is valid
+void bit_array_add_words(BIT_ARRAY *bitarr, bit_index_t pos, const BIT_ARRAY *add)
+{
+  assert(bitarr != add); // bitarr and add cannot point to the same bit array
+
+  bit_index_t add_top_bit_set;
+
+  if(!bit_array_find_last_set_bit(add, &add_top_bit_set))
+  {
+    // No bits set in add
+    return;
+  }
+  else if(pos >= bitarr->num_of_bits)
+  {
+    // Just resize and copy!
+    bit_index_t num_bits_required = pos + add_top_bit_set + 1;
+    bit_array_resize_critical(bitarr, num_bits_required);
+    _array_copy(bitarr, pos, add, 0, add->num_of_bits);
+    return;
+  }
+  else if(pos == 0)
+  {
+    bit_array_add(bitarr, bitarr, add);
+    return;
+  }
+
+  /*
+  char str[1000];
+  printf(" add_words1: %s\n", bit_array_to_str_rev(bitarr, str));
+  printf(" add_words2: %s\n", bit_array_to_str_rev(add, str));
+  printf(" [pos: %i]\n", (int)pos);
+  */
+
+  bit_index_t num_bits_required = pos + add_top_bit_set + 1;
+  bit_array_ensure_size(bitarr, num_bits_required);
+
+  word_addr_t first_word = bitset64_wrd(pos);
+  word_offset_t first_offset = bitset64_idx(pos);
+
+  word_t w = add->words[0] << first_offset;
+  unsigned char carry = (WORD_MAX - bitarr->words[first_word] < w);
+
+  bitarr->words[first_word] += w;
+
+  word_addr_t i = first_word + 1;
+  bit_index_t offset = WORD_SIZE - first_offset;
+
+  for(; carry || offset <= add_top_bit_set; i++, offset += WORD_SIZE)
+  {
+    w = offset < add->num_of_bits ? _get_word(add, offset) : (word_t)0;
+
+    if(i >= bitarr->num_of_words)
+    {
+      // Extend by a word
+      bit_array_resize_critical(bitarr, (bit_index_t)(i+1)*WORD_SIZE+1);
+    }
+
+    word_t prev = bitarr->words[i];
+
+    bitarr->words[i] += w + carry;
+
+    carry = (WORD_MAX - prev < w || (carry && prev + w == WORD_MAX)) ? 1 : 0;
+  }
+
+  word_offset_t top_bits
+    = WORD_SIZE - leading_zeros(bitarr->words[bitarr->num_of_words-1]);
+
+  bit_index_t min_bits = (bitarr->num_of_words-1)*WORD_SIZE + top_bits;
+
+  if(bitarr->num_of_bits < min_bits)
+  {
+    // Extend within the last word
+    bitarr->num_of_bits = min_bits;
+  }
+
+  DEBUG_VALIDATE(bitarr);
+}
+
+char bit_array_sub_word(BIT_ARRAY* bitarr, bit_index_t pos, word_t minus)
+{
+  DEBUG_VALIDATE(bitarr);
+
+  if(minus == 0)
+  {
+    return 1;
+  }
+
+  word_t w = _get_word(bitarr, pos);
+
+  if(w >= minus)
+  {
+    _set_word(bitarr, pos, w - minus);
+    DEBUG_VALIDATE(bitarr);
+    return 1;
+  }
+
+  minus -= w;
+
+  bit_index_t offset;
+  for(offset = pos + WORD_SIZE; offset < bitarr->num_of_bits; offset += WORD_SIZE)
+  {
+    w = _get_word(bitarr, offset);
+
+    if(w > 0)
+    {
+      // deduct one
+      _set_word(bitarr, offset, w - 1);
+
+      SET_REGION(bitarr, pos, offset-pos);
+
+      // -1 since we've already deducted 1
+      minus--;
+
+      _set_word(bitarr, pos, WORD_MAX - minus);
+
+      DEBUG_VALIDATE(bitarr);
+      return 1;
+    }
+  }
+
+  DEBUG_VALIDATE(bitarr);
+
+  return 0;
+}
+
+char bit_array_sub_words(BIT_ARRAY* bitarr, bit_index_t pos, BIT_ARRAY* minus)
+{
+  assert(bitarr != minus); // bitarr and minus cannot point to the same bit array
+
+  int cmp = bit_array_cmp_words(bitarr, pos, minus);
+
+  if(cmp == 0)
+  {
+    bit_array_clear_all(bitarr);
+    return 1;
+  }
+  else if(cmp < 0)
+  {
+    return 0;
+  }
+
+  bit_index_t bitarr_length = bitarr->num_of_bits;
+
+  bit_index_t bitarr_top_bit_set;
+  bit_array_find_last_set_bit(bitarr, &bitarr_top_bit_set);
+
+  // subtraction by method of complements:
+  // a - b = a + ~b + 1 = src1 + ~src2 +1
+
+  bit_array_not(minus, minus);
+
+  bit_array_add_words(bitarr, pos, minus);
+  bit_array_add_word(bitarr, pos, (word_t)1);
+
+  bit_array_sub_word(bitarr, pos+minus->num_of_bits, 1);
+  bit_array_resize(bitarr, bitarr_length);
+
+  bit_array_not(minus, minus);
+
+  DEBUG_VALIDATE(bitarr);
+
+  return 1;
+}
+
+void bit_array_mul_uint64(BIT_ARRAY *bitarr, uint64_t multiplier)
+{
+  if(bitarr->num_of_bits == 0 || multiplier == 1)
+  {
+    return;
+  }
+  else if(multiplier == 0)
+  {
+    bit_array_clear_all(bitarr);
+    return;
+  }
+
+  bit_index_t i;
+
+  for(i = bitarr->num_of_bits; i > 0; i--)
+  {
+    if(bit_array_get(bitarr, i-1))
+    {
+      bit_array_clear(bitarr, i-1);
+      bit_array_add_word(bitarr, i-1, multiplier);
+    }
+  }
+
+  DEBUG_VALIDATE(bitarr);
+}
+
+void bit_array_multiply(BIT_ARRAY *dst, BIT_ARRAY *src1, BIT_ARRAY *src2)
+{
+  if(src1->num_of_bits == 0 || src2->num_of_bits == 0)
+  {
+    bit_array_clear_all(dst);
+    return;
+  }
+
+  // Cannot pass the same array as dst, src1 AND src2
+  assert(dst != src1 || dst != src2);
+
+  // Dev: multiplier == 1?
+
+  BIT_ARRAY *read_arr, *add_arr;
+
+  if(src1 == dst)
+  {
+    read_arr = src1;
+    add_arr = src2;
+  }
+  else
+  {
+    read_arr = src2;
+    add_arr = src1;
+  }
+
+  if(dst != src1 && dst != src2)
+  {
+    bit_array_clear_all(dst);
+  }
+
+  bit_index_t i;
+
+  for(i = read_arr->num_of_bits; i > 0; i--)
+  {
+    if(bit_array_get(read_arr, i-1))
+    {
+      bit_array_clear(dst, i-1);
+      bit_array_add_words(dst, i-1, add_arr);
+    }
+  }
+
+  DEBUG_VALIDATE(dst);
+}
+
+// bitarr = round_down(bitarr / divisor)
+// rem = bitarr % divisor
+void bit_array_div_uint64(BIT_ARRAY *bitarr, uint64_t divisor, uint64_t *rem)
+{
+  assert(divisor != 0); // cannot divide by zero
+
+  bit_index_t div_top_bit = 63 - leading_zeros(divisor);
+  bit_index_t bitarr_top_bit;
+
+  if(!bit_array_find_last_set_bit(bitarr, &bitarr_top_bit))
+  {
+    *rem = 0;
+    return;
+  }
+
+  if(bitarr_top_bit < div_top_bit)
+  {
+    *rem = bitarr->words[0];
+    bit_array_clear_all(bitarr);
+    return;
+  }
+
+  // When div is shifted by offset, their top set bits are aligned
+  bit_index_t offset = bitarr_top_bit - div_top_bit;
+
+  uint64_t tmp = _get_word(bitarr, offset);
+  _set_word(bitarr, offset, (word_t)0);
+
+  // Carry if 1 if the top bit was set before left shift
+  char carry = 0;
+
+  // offset unsigned so break when offset == 0
+  while(1)
+  {
+    if(carry)
+    {
+      // (carry:tmp) - divisor = (WORD_MAX+1+tmp)-divisor
+      tmp = WORD_MAX - divisor + tmp + 1;
+      bit_array_set(bitarr, offset);
+    }
+    else if(tmp >= divisor)
+    {
+      tmp -= divisor;
+      bit_array_set(bitarr, offset);
+    }
+    else
+    {
+      bit_array_clear(bitarr, offset);
+    }
+
+    if(offset == 0)
+      break;
+
+    offset--;
+
+    // Is the top bit set (that we're about to shift off)?
+    carry = tmp & 0x8000000000000000;
+
+    tmp <<= 1;
+    tmp |= bit_array_get(bitarr, offset);
+  }
+
+  *rem = tmp;
+}
+
+// Results in:
+//   quotient = dividend / divisor
+//   dividend = dividend % divisor
+// (dividend is used to return the remainder)
+void bit_array_divide(BIT_ARRAY *dividend, BIT_ARRAY *quotient, BIT_ARRAY *divisor)
+{
+  assert(bit_array_cmp_uint64(divisor, 0) != 0); // Cannot divide by zero
+
+  bit_array_clear_all(quotient);
+
+  int cmp = bit_array_cmp(dividend, divisor);
+
+  if(cmp == 0)
+  {
+    bit_array_ensure_size(quotient, 1);
+    bit_array_set(quotient, 0);
+    bit_array_clear_all(dividend);
+    return;
+  }
+  else if(cmp < 0)
+  {
+    // dividend is < divisor, quotient is zero -- done
+    return;
+  }
+
+  // now we know: dividend > divisor, quotient is zero'd,
+  //              dividend != 0, divisor != 0
+  bit_index_t dividend_top_bit = 0, div_top_bit = 0;
+
+  bit_array_find_last_set_bit(dividend, &dividend_top_bit);
+  bit_array_find_last_set_bit(divisor, &div_top_bit);
+
+  // When divisor is shifted by offset, their top set bits are aligned
+  bit_index_t offset = dividend_top_bit - div_top_bit;
+
+  // offset unsigned so break when offset == 0
+  for(; ; offset--)
+  {
+    if(bit_array_cmp_words(dividend, offset, divisor) >= 0)
+    {
+      bit_array_sub_words(dividend, offset, divisor);
+      bit_array_ensure_size(quotient, offset+1);
+      bit_array_set(quotient, offset);
+    }
+
+    if(offset == 0)
+      break;
+  }
+}
+
+//
+// Read/Write from files
+//
+// file format is [8 bytes: for number of elements in array][data]
+// data is written in little endian order (least sig byte first)
+//
+
+// Saves bit array to a file. Returns the number of bytes written
+// number of bytes returned should be 8+(bitarr->num_of_bits+7)/8
+bit_index_t bit_array_save(const BIT_ARRAY* bitarr, FILE* f)
+{
+  bit_index_t num_of_bytes = roundup_bits2bytes(bitarr->num_of_bits);
+  bit_index_t bytes_written = 0;
+
+  const int endian = 1;
+  if(*(uint8_t*)&endian == 1)
+  {
+    // Little endian machine
+    // Write 8 bytes to store the number of bits in the array
+    bytes_written += fwrite(&bitarr->num_of_bits, 1, 8, f);
+
+    // Write the array
+    bytes_written += fwrite(bitarr->words, 1, num_of_bytes, f);
+  }
+  else
+  {
+    // Big endian machine
+    uint64_t i, w, whole_words = num_of_bytes/sizeof(word_t);
+    uint64_t rem_bytes = num_of_bytes - whole_words*sizeof(word_t);
+    uint64_t n_bits = byteswap64(bitarr->num_of_bits);
+
+    // Write 8 bytes to store the number of bits in the array
+    bytes_written += fwrite(&n_bits, 1, 8, f);
+
+    // Write the array
+    for(i = 0; i < whole_words; i++) {
+      w = byteswap64(bitarr->words[i]);
+      bytes_written += fwrite(&w, 1, 8, f);
+    }
+
+    if(rem_bytes > 0) {
+      w = byteswap64(bitarr->words[whole_words]);
+      bytes_written += fwrite(&w, 1, rem_bytes, f);
+    }
+  }
+
+  return bytes_written;
+}
+
+// Load a uint64 from little endian format.
+// Works for both big and little endian architectures
+static inline uint64_t le64_to_cpu(const uint8_t *x)
+{
+  return (((uint64_t)(x[0]))       | ((uint64_t)(x[1]) << 8)  |
+          ((uint64_t)(x[2]) << 16) | ((uint64_t)(x[3]) << 24) |
+          ((uint64_t)(x[4]) << 32) | ((uint64_t)(x[5]) << 40) |
+          ((uint64_t)(x[6]) << 48) | ((uint64_t)(x[7]) << 56));
+}
+
+// Reads bit array from a file. bitarr is resized and filled.
+// Returns 1 on success, 0 on failure
+char bit_array_load(BIT_ARRAY* bitarr, FILE* f)
+{
+  // Read in number of bits, return 0 if we can't read in
+  bit_index_t num_bits;
+  if(fread(&num_bits, 1, 8, f) != 8) return 0;
+  num_bits = le64_to_cpu((uint8_t*)&num_bits);
+
+  // Resize
+  bit_array_resize_critical(bitarr, num_bits);
+
+  // Have to calculate how many bytes are needed for the file
+  // (Note: this may be different from num_of_words * sizeof(word_t))
+  bit_index_t num_of_bytes = roundup_bits2bytes(bitarr->num_of_bits);
+  if(fread(bitarr->words, 1, num_of_bytes, f) != num_of_bytes) return 0;
+
+  // Fix endianness
+  word_addr_t i;
+  for(i = 0; i < bitarr->num_of_words; i++)
+    bitarr->words[i] = le64_to_cpu((uint8_t*)&bitarr->words[i]);
+
+  // Mask top word
+  _mask_top_word(bitarr);
+  DEBUG_VALIDATE(bitarr);
+  return 1;
+}
+
+//
+// Hash function
+//
+
+/* From: lookup3.c, by Bob Jenkins, May 2006, Public Domain. */
+#define hashsize(n) ((uint32_t)1<<(n))
+#define hashmask(n) (hashsize(n)-1)
+#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
+
+/* From: lookup3.c, by Bob Jenkins, May 2006, Public Domain. */
+#define mix(a,b,c) \
+{ \
+  a -= c;  a ^= rot(c, 4);  c += b; \
+  b -= a;  b ^= rot(a, 6);  a += c; \
+  c -= b;  c ^= rot(b, 8);  b += a; \
+  a -= c;  a ^= rot(c,16);  c += b; \
+  b -= a;  b ^= rot(a,19);  a += c; \
+  c -= b;  c ^= rot(b, 4);  b += a; \
+}
+
+/* From: lookup3.c, by Bob Jenkins, May 2006, Public Domain. */
+#define final(a,b,c) \
+{ \
+  c ^= b; c -= rot(b,14); \
+  a ^= c; a -= rot(c,11); \
+  b ^= a; b -= rot(a,25); \
+  c ^= b; c -= rot(b,16); \
+  a ^= c; a -= rot(c,4);  \
+  b ^= a; b -= rot(a,14); \
+  c ^= b; c -= rot(b,24); \
+}
+
+/*
+From: lookup3.c, by Bob Jenkins, May 2006, Public Domain.
+--------------------------------------------------------------------
+hashword2() -- same as hashword(), but take two seeds and return two
+32-bit values.  pc and pb must both be nonnull, and *pc and *pb must
+both be initialized with seeds.  If you pass in (*pb)==0, the output
+(*pc) will be the same as the return value from hashword().
+--------------------------------------------------------------------
+*/
+static void hashword2 (
+const uint32_t *k,                   /* the key, an array of uint32_t values */
+size_t          length,               /* the length of the key, in uint32_ts */
+uint32_t       *pc,                      /* IN: seed OUT: primary hash value */
+uint32_t       *pb)               /* IN: more seed OUT: secondary hash value */
+{
+  uint32_t a,b,c;
+
+  /* Set up the internal state */
+  a = b = c = 0xdeadbeef + ((uint32_t)(length<<2)) + *pc;
+  c += *pb;
+
+  /*------------------------------------------------- handle most of the key */
+  while (length > 3)
+  {
+    a += k[0];
+    b += k[1];
+    c += k[2];
+    mix(a,b,c);
+    length -= 3;
+    k += 3;
+  }
+
+  /*------------------------------------------- handle the last 3 uint32_t's */
+  switch(length)                     /* all the case statements fall through */
+  {
+  case 3 : c+=k[2];
+  case 2 : b+=k[1];
+  case 1 : a+=k[0];
+    final(a,b,c);
+  case 0:     /* case 0: nothing left to add */
+    break;
+  }
+  /*------------------------------------------------------ report the result */
+  *pc=c; *pb=b;
+}
+
+// Pass seed as 0 on first call, pass previous hash value if rehashing due
+// to a collision
+// Using bob jenkins hash lookup3
+uint64_t bit_array_hash(const BIT_ARRAY* bitarr, uint64_t seed)
+{
+  uint32_t seed32[2];
+  memcpy(seed32, &seed, sizeof(uint32_t)*2);
+
+  // Round up length to number 32bit words
+  hashword2((uint32_t*)bitarr->words, (bitarr->num_of_bits + 31) / 32,
+            &seed32[0], &seed32[1]);
+
+  // XOR with array length. This ensures arrays with different length but same
+  // contents have different hash values
+  seed ^= bitarr->num_of_bits;
+
+  return seed;
+}
+
+
+//
+// Generally useful functions
+//
+
+// Generalised 'binary to string' function
+// Adds bits to the string in order of lsb to msb
+// e.g. 0b11010 (26 in decimal) would come out as "01011"
+char* bit_array_word2str(const void *ptr, size_t num_of_bits, char *str)
+{
+  const uint8_t* d = (const uint8_t*)ptr;
+
+  size_t i;
+  for(i = 0; i < num_of_bits; i++)
+  {
+    uint8_t bit = (d[i/8] >> (i % 8)) & 0x1;
+    str[i] = bit ? '1' : '0';
+  }
+  str[num_of_bits] = '\0';
+  return str;
+}
+
+char* bit_array_word2str_rev(const void *ptr, size_t num_of_bits, char *str)
+{
+  const uint8_t* d = (const uint8_t*)ptr;
+
+  size_t i;
+  for(i = 0; i < num_of_bits; i++)
+  {
+    uint8_t bit = (d[i/8] >> (i % 8)) & 0x1;
+    str[num_of_bits-1-i] = bit ? '1' : '0';
+  }
+  str[num_of_bits] = '\0';
+  return str;
+}

+ 552 - 0
bit_array.h

@@ -0,0 +1,552 @@
+/*
+ bit_array.h
+ project: bit array C library
+ url: https://github.com/noporpoise/BitArray/
+ maintainer: Isaac Turner <turner.isaac@gmail.com>
+ license: Public Domain, no warranty
+ date: Sep 2014
+*/
+
+#ifndef BIT_ARRAY_HEADER_SEEN
+#define BIT_ARRAY_HEADER_SEEN
+
+#include <stdio.h>
+#include <inttypes.h>
+
+#include "bit_macros.h"
+
+typedef struct BIT_ARRAY BIT_ARRAY;
+
+// 64 bit words
+typedef uint64_t word_t, word_addr_t, bit_index_t;
+typedef uint8_t word_offset_t; // Offset within a 64 bit word
+
+#define BIT_INDEX_MIN 0
+#define BIT_INDEX_MAX (~(bit_index_t)0)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//
+// Structs
+//
+
+struct BIT_ARRAY
+{
+  word_t* words;
+  bit_index_t num_of_bits;
+  // Number of words used -- this is just round_up(num_of_bits / 64)
+  // if num_of_bits == 0, this is 0
+  word_addr_t num_of_words;
+  // For more efficient allocation we use realloc only to double size --
+  // not for adding every word.  Initial size is INIT_CAPACITY_WORDS.
+  word_addr_t capacity_in_words;
+};
+
+//
+// Basics: Constructor, destructor, get length, resize
+//
+
+// Constructor - create a new bit array of length nbits
+BIT_ARRAY* bit_array_create(bit_index_t nbits);
+
+// Destructor - free the memory used for a bit array
+void bit_array_free(BIT_ARRAY* bitarray);
+
+// Allocate using existing struct
+BIT_ARRAY* bit_array_alloc(BIT_ARRAY* bitarr, bit_index_t nbits);
+void bit_array_dealloc(BIT_ARRAY* bitarr);
+
+// Get length of bit array
+bit_index_t bit_array_length(const BIT_ARRAY* bit_arr);
+
+// Change the size of a bit array. Enlarging an array will add zeros
+// to the end of it. Returns 1 on success, 0 on failure (e.g. not enough memory)
+char bit_array_resize(BIT_ARRAY* bitarr, bit_index_t new_num_of_bits);
+
+// If bitarr length < num_bits, resizes to num_bits
+char bit_array_ensure_size(BIT_ARRAY* bitarr, bit_index_t ensure_num_of_bits);
+
+// Same as above but exit with an error message if out of memory
+void bit_array_resize_critical(BIT_ARRAY* bitarr, bit_index_t num_of_bits);
+void bit_array_ensure_size_critical(BIT_ARRAY* bitarr, bit_index_t num_of_bits);
+
+
+//
+// Macros
+//
+
+//
+// Get, set, clear, assign and toggle individual bits
+// Macros for fast access -- beware: no bounds checking
+//
+
+#define bit_array_get(arr,i)      bitset_get((arr)->words, i)
+#define bit_array_set(arr,i)      bitset_set((arr)->words, i)
+#define bit_array_clear(arr,i)    bitset_del((arr)->words, i)
+#define bit_array_toggle(arr,i)   bitset_tgl((arr)->words, i)
+// c must be 0 or 1
+#define bit_array_assign(arr,i,c) bitset_cpy((arr)->words,i,c)
+
+//
+// Get, set, clear, assign and toggle individual bits
+// "Safe": use assert() to check bounds
+//
+
+// Get the value of a bit (returns 0 or 1)
+char bit_array_get_bit(const BIT_ARRAY* bitarr, bit_index_t b);
+void bit_array_set_bit(BIT_ARRAY* bitarr, bit_index_t b);
+void bit_array_clear_bit(BIT_ARRAY* bitarr, bit_index_t b);
+void bit_array_toggle_bit(BIT_ARRAY* bitarr, bit_index_t b);
+// If char c != 0, set bit; otherwise clear bit
+void bit_array_assign_bit(BIT_ARRAY* bitarr, bit_index_t b, char c);
+
+//
+// "Resizing": enlarge array if needed
+//
+
+char bit_array_rget(BIT_ARRAY* bitarr, bit_index_t b);
+void bit_array_rset(BIT_ARRAY* bitarr, bit_index_t b);
+void bit_array_rclear(BIT_ARRAY* bitarr, bit_index_t b);
+void bit_array_rtoggle(BIT_ARRAY* bitarr, bit_index_t b);
+void bit_array_rassign(BIT_ARRAY* bitarr, bit_index_t b, char c);
+
+//
+// Set, clear and toggle several bits at once
+//
+
+// Set multiple bits at once.
+// e.g. set bits 1, 20 & 31: bit_array_set_bits(bitarr, 3, 1,20,31);
+// Note: variable args are of type unsigned int
+void bit_array_set_bits(BIT_ARRAY* bitarr, size_t n, ...);
+
+// Clear multiple bits at once.
+// e.g. clear bits 1, 20 & 31: bit_array_clear_bits(bitarr, 3, 1,20,31);
+// Note: variable args are of type unsigned int
+void bit_array_clear_bits(BIT_ARRAY* bitarr, size_t n, ...);
+
+// Toggle multiple bits at once
+// e.g. toggle bits 1, 20 & 31: bit_array_toggle_bits(bitarr, 3, 1,20,31);
+// Note: variable args are of type unsigned int
+void bit_array_toggle_bits(BIT_ARRAY* bitarr, size_t n, ...);
+
+//
+// Set, clear and toggle all bits in a region
+//
+
+// Set all the bits in a region
+void bit_array_set_region(BIT_ARRAY* bitarr, bit_index_t start, bit_index_t len);
+
+// Clear all the bits in a region
+void bit_array_clear_region(BIT_ARRAY* bitarr, bit_index_t start, bit_index_t len);
+
+// Toggle all the bits in a region
+void bit_array_toggle_region(BIT_ARRAY* bitarr, bit_index_t start, bit_index_t len);
+
+//
+// Set, clear and toggle all bits at once
+//
+
+// Set all bits in this array to 1
+void bit_array_set_all(BIT_ARRAY* bitarr);
+
+// Set all bits in this array to 0
+void bit_array_clear_all(BIT_ARRAY* bitarr);
+
+// Set all 1 bits to 0, and all 0 bits to 1
+void bit_array_toggle_all(BIT_ARRAY* bitarr);
+
+//
+// Get / set a word of a given size
+//
+
+// First bit is in the least significant bit position
+// start index must be within the range of the bit array (0 <= x < length)
+uint64_t bit_array_get_word64(const BIT_ARRAY* bitarr, bit_index_t start);
+uint32_t bit_array_get_word32(const BIT_ARRAY* bitarr, bit_index_t start);
+uint16_t bit_array_get_word16(const BIT_ARRAY* bitarr, bit_index_t start);
+uint8_t  bit_array_get_word8(const BIT_ARRAY* bitarr, bit_index_t start);
+uint64_t bit_array_get_wordn(const BIT_ARRAY* bitarr, bit_index_t start, int n);
+
+// Set 64 bits at once from a particular start position
+void bit_array_set_word64(BIT_ARRAY* bitarr, bit_index_t start, uint64_t word);
+void bit_array_set_word32(BIT_ARRAY* bitarr, bit_index_t start, uint32_t word);
+void bit_array_set_word16(BIT_ARRAY* bitarr, bit_index_t start, uint16_t word);
+void bit_array_set_word8(BIT_ARRAY* bitarr, bit_index_t start, uint8_t byte);
+void bit_array_set_wordn(BIT_ARRAY* bitarr, bit_index_t start, uint64_t word, int n);
+
+//
+// Number of bits set
+//
+
+// Get the number of bits set (hamming weight)
+bit_index_t bit_array_num_bits_set(const BIT_ARRAY* bitarr);
+
+// Get the number of bits not set (length - hamming weight)
+bit_index_t bit_array_num_bits_cleared(const BIT_ARRAY* bitarr);
+
+// Get the number of bits set in on array and not the other.  This is equivalent
+// to hamming weight of the XOR when the two arrays are the same length.
+// e.g. 10101 vs 00111 => hamming distance 2 (XOR is 10010)
+bit_index_t bit_array_hamming_distance(const BIT_ARRAY* arr1,
+                                       const BIT_ARRAY* arr2);
+
+// Parity - returns 1 if odd number of bits set, 0 if even
+char bit_array_parity(const BIT_ARRAY* bitarr);
+
+//
+// Find indices of set/clear bits
+//
+
+// Find the index of the next bit that is set, at or after `offset`
+// Returns 1 if a bit is set, otherwise 0
+// Index of next set bit is stored in the integer pointed to by result
+// If no next bit is set result is not changed
+char bit_array_find_next_set_bit(const BIT_ARRAY* bitarr, bit_index_t offset,
+                                 bit_index_t* result);
+
+// Find the index of the next bit that is NOT set, at or after `offset`
+// Returns 1 if a bit is NOT set, otherwise 0
+// Index of next zero bit is stored in the integer pointed to by `result`
+// If no next bit is zero, value at `result` is not changed
+char bit_array_find_next_clear_bit(const BIT_ARRAY* bitarr, bit_index_t offset,
+                                 bit_index_t* result);
+
+// Find the index of the previous bit that is set, before offset.
+// Returns 1 if a bit is set, otherwise 0
+// Index of previous set bit is stored in the integer pointed to by `result`
+// If no previous bit is set result is not changed
+char bit_array_find_prev_set_bit(const BIT_ARRAY* bitarr, bit_index_t offset,
+                                 bit_index_t* result);
+
+// Find the index of the previous bit that is NOT set, before offset.
+// Returns 1 if a bit is clear, otherwise 0
+// Index of previous zero bit is stored in the integer pointed to by `result`
+// If no previous bit is zero result is not changed
+char bit_array_find_prev_clear_bit(const BIT_ARRAY* bitarr, bit_index_t offset,
+                                   bit_index_t* result);
+
+// Find the index of the first bit that is set.
+// Returns 1 if a bit is set, otherwise 0
+// Index of first set bit is stored in the integer pointed to by `result`
+// If no bit is set result is not changed
+char bit_array_find_first_set_bit(const BIT_ARRAY* bitarr, bit_index_t* result);
+
+// Find the index of the first bit that is NOT set.
+// Returns 1 if a bit is clear, otherwise 0
+// Index of first zero bit is stored in the integer pointed to by `result`
+// If no bit is zero result is not changed
+char bit_array_find_first_clear_bit(const BIT_ARRAY* bitarr, bit_index_t* result);
+
+// Find the index of the last bit that is set.
+// Returns 1 if a bit is set, otherwise 0
+// Index of last set bit is stored in the integer pointed to by `result`
+// If no bit is set result is not changed
+char bit_array_find_last_set_bit(const BIT_ARRAY* bitarr, bit_index_t* result);
+
+// Find the index of the last bit that is NOT set.
+// Returns 1 if a bit is clear, otherwise 0
+// Index of last zero bit is stored in the integer pointed to by `result`
+// If no bit is zero result is not changed
+char bit_array_find_last_clear_bit(const BIT_ARRAY* bitarr, bit_index_t* result);
+
+
+//
+// Sorting
+//
+
+// Put all the 0s before all the 1s
+void bit_array_sort_bits(BIT_ARRAY* bitarr);
+
+// Put all the 1s before all the 0s
+void bit_array_sort_bits_rev(BIT_ARRAY* bitarr);
+
+
+//
+// String and printing methods
+//
+
+// Construct a BIT_ARRAY from a string.
+void bit_array_from_str(BIT_ARRAY* bitarr, const char* bitstr);
+
+// Construct a BIT_ARRAY from a substring with given on and off characters.
+void bit_array_from_substr(BIT_ARRAY* bitarr, bit_index_t offset,
+                           const char* str, size_t len,
+                           const char *on, const char *off, char left_to_right);
+
+// Takes a char array to write to.  `str` must be bitarr->num_of_bits+1 in
+// length. Terminates string with '\0'
+char* bit_array_to_str(const BIT_ARRAY* bitarr, char* str);
+char* bit_array_to_str_rev(const BIT_ARRAY* bitarr, char* str);
+
+// Get a string representations for a given region, using given on/off
+// characters.
+// Note: does not null-terminate
+void bit_array_to_substr(const BIT_ARRAY* bitarr,
+                         bit_index_t start, bit_index_t length,
+                         char* str, char on, char off, char left_to_right);
+
+// Print this array to a file stream.  Prints '0's and '1'.  Doesn't print
+// newline.
+void bit_array_print(const BIT_ARRAY* bitarr, FILE* fout);
+
+// Print a string representations for a given region, using given on/off
+// characters. Reverse prints from highest to lowest -- this is useful for
+// printing binary numbers
+void bit_array_print_substr(const BIT_ARRAY* bitarr,
+                            bit_index_t start, bit_index_t length,
+                            FILE* fout, char on, char off, char left_to_right);
+
+//
+// Decimal
+//
+
+// Get bit array as decimal str (e.g. 0b1101 -> "13")
+size_t bit_array_to_decimal(const BIT_ARRAY *bitarr, char *str, size_t len);
+
+// Return number of characters used
+size_t bit_array_from_decimal(BIT_ARRAY *bitarr, const char* decimal);
+
+//
+// Hexidecimal
+//
+
+// Loads array from hex string
+// Returns the number of bits loaded (will be chars rounded up to multiple of 8)
+// (0 on failure)
+bit_index_t bit_array_from_hex(BIT_ARRAY* bitarr, bit_index_t offset,
+                               const char* str, size_t len);
+
+// Returns number of characters written
+size_t bit_array_to_hex(const BIT_ARRAY* bitarr,
+                        bit_index_t start, bit_index_t length,
+                        char* str, char uppercase);
+
+// Print bit array as hex
+size_t bit_array_print_hex(const BIT_ARRAY* bitarr,
+                           bit_index_t start, bit_index_t length,
+                           FILE* fout, char uppercase);
+
+//
+// Clone and copy
+//
+
+// Copy a BIT_ARRAY struct and the data it holds - returns pointer to new object
+#define bit_array_dup	bit_array_clone
+BIT_ARRAY* bit_array_clone(const BIT_ARRAY* bitarr);
+
+// Copy bits from one array to another
+// Note: use MACRO bit_array_copy
+// Destination and source can be the same bit_array and
+// src/dst regions can overlap
+void bit_array_copy(BIT_ARRAY* dst, bit_index_t dstindx,
+                    const BIT_ARRAY* src, bit_index_t srcindx,
+                    bit_index_t length);
+
+// copy all of src to dst. dst is resized to match src.
+void bit_array_copy_all(BIT_ARRAY* dst, const BIT_ARRAY* src);
+
+//
+// Logic operators
+//
+
+// BIT_ARRAYs can all be different or the same object
+// dest array will be resized if it is too short
+//
+void bit_array_and(BIT_ARRAY* dest, const BIT_ARRAY* src1, const BIT_ARRAY* src2);
+void bit_array_or (BIT_ARRAY* dest, const BIT_ARRAY* src1, const BIT_ARRAY* src2);
+void bit_array_xor(BIT_ARRAY* dest, const BIT_ARRAY* src1, const BIT_ARRAY* src2);
+void bit_array_not(BIT_ARRAY* dest, const BIT_ARRAY* src);
+
+//
+// Comparisons
+//
+
+// Note: (bit_array_cmp(a,b) == 0) <=> (bit_array_cmp_big_endian(a,b) == 0)
+
+// comparison functions return:
+//   1 iff bitarr1 > bitarr2
+//   0 iff bitarr1 == bitarr2
+//  -1 iff bitarr1 < bitarr2
+
+// Compare two bit arrays by value stored, with index 0 being the Least
+// Significant Bit (LSB). Arrays do not have to be the same length.
+// Example: ..0101 (5) > ...0011 (3) [index 0 is LSB at right hand side]
+int bit_array_cmp(const BIT_ARRAY* bitarr1, const BIT_ARRAY* bitarr2);
+
+// Compare two bit arrays by value stored, with index 0 being the Most
+// Significant Bit (MSB). Arrays do not have to be the same length.
+// Example: 10.. > 01.. [index 0 is MSB at left hand side]
+int bit_array_cmp_big_endian(const BIT_ARRAY* bitarr1, const BIT_ARRAY* bitarr2);
+
+// compare bitarr with (bitarr2 << pos)
+int bit_array_cmp_words(const BIT_ARRAY *bitarr,
+                        bit_index_t pos, const BIT_ARRAY *bitarr2);
+
+//
+// Shift, interleave, reverse
+//
+
+// Shift array left/right.  If fill is zero, filled with 0, otherwise 1
+void bit_array_shift_right(BIT_ARRAY* bitarr, bit_index_t shift_dist, char fill);
+void bit_array_shift_left (BIT_ARRAY* bitarr, bit_index_t shift_dist, char fill);
+
+// shift left without losing any bits. Resizes bitarr.
+void bit_array_shift_left_extend(BIT_ARRAY* bitarr, bit_index_t shift_dist,
+                                 char fill);
+
+// Cyclic shift
+void bit_array_cycle_right(BIT_ARRAY* bitarr, bit_index_t dist);
+void bit_array_cycle_left (BIT_ARRAY* bitarr, bit_index_t dist);
+
+// Interleave
+// dst cannot point to the same bit array as src1 or src2
+// src1, src2 may point to the same bit array
+// abcd 1234 -> a1b2c3d4
+// 0011 0000 -> 00001010
+// 1111 0000 -> 10101010
+// 0101 1010 -> 01100110
+// Extends dst if it is too short, but does not shrink it if it is too long
+// if dst is longer than length(src1)+length(src2), the end bits are not altered
+void bit_array_interleave(BIT_ARRAY* dst,
+                          const BIT_ARRAY* src1,
+                          const BIT_ARRAY* src2);
+
+// Reverse the whole array or part of it
+void bit_array_reverse(BIT_ARRAY* bitarr);
+void bit_array_reverse_region(BIT_ARRAY* bitarr, bit_index_t start, bit_index_t len);
+
+//
+// Numeric
+//
+
+// Returns 1 on sucess, 0 if value in array is too big
+char bit_array_as_num(const BIT_ARRAY* bitarr, uint64_t* result);
+
+// 1 iff bitarr > value
+// 0 iff bitarr == value
+// -1 iff bitarr < value
+int bit_array_cmp_uint64(const BIT_ARRAY* bitarr, uint64_t value);
+
+//
+// Arithmetic
+//
+
+// bitarr will be extended if needed
+void bit_array_add_uint64(BIT_ARRAY* bitarr, uint64_t value);
+
+// Add `add` to `bitarr` at `pos` -- same as:
+//   bitarr + (add << pos)
+// where pos can be bigger than the length of the array (bitarr will be resized)
+void bit_array_add_word(BIT_ARRAY *bitarr, bit_index_t pos, uint64_t add);
+
+// Add `add` to `bitarr` at `pos`
+void bit_array_add_words(BIT_ARRAY *bitarr, bit_index_t pos, const BIT_ARRAY *add);
+
+// If value is greater than bitarr, bitarr is not changed and 0 is returned
+// Returns 1 on success, 0 if value > bitarr
+char bit_array_sub_uint64(BIT_ARRAY* bitarr, uint64_t value);
+
+// minus `minus` from `bitarr` at `pos` -- same as:
+//   bitarr + (minus << pos)
+// Returns 1 on success, 0 if value > bitarr
+char bit_array_sub_word(BIT_ARRAY *bitarr, bit_index_t pos, word_t minus);
+
+// minus `minus` from `bitarr` at `pos`
+// Returns 1 on success, 0 if value > bitarr
+char bit_array_sub_words(BIT_ARRAY* bitarr, bit_index_t pos, BIT_ARRAY* minus);
+
+// Multiply by some value
+void bit_array_mul_uint64(BIT_ARRAY *bitarr, uint64_t multiplier);
+
+// bitarr = round_down(bitarr / divisor)
+// rem = bitarr % divisor
+void bit_array_div_uint64(BIT_ARRAY *bitarr, uint64_t divisor, uint64_t *rem);
+
+//
+// Arithmetic between arrays
+//
+
+// dst = src1 + src2
+// src1, src2 and dst can all be the same BIT_ARRAY
+// If dst is shorter than either of src1, src2, it is enlarged
+void bit_array_add(BIT_ARRAY* dst, const BIT_ARRAY* src1, const BIT_ARRAY* src2);
+
+// dst = src1 - src2
+// src1, src2 and dst can all be the same BIT_ARRAY
+// If dst is shorter than src1, it will be extended to be as long as src1
+// src1 must be greater than or equal to src2 (src1 >= src2)
+void bit_array_subtract(BIT_ARRAY* dst,
+                        const BIT_ARRAY* src1, const BIT_ARRAY* src2);
+
+// dst = src1 * src2
+// Pointers cannot all point to the same BIT_ARRAY
+void bit_array_multiply(BIT_ARRAY *dst, BIT_ARRAY *src1, BIT_ARRAY *src2);
+
+// Results in:
+//   quotient = dividend / divisor
+//   dividend = dividend % divisor
+// (dividend is used to return the remainder)
+void bit_array_divide(BIT_ARRAY *dividend, BIT_ARRAY *quotient, BIT_ARRAY *divisor);
+
+//
+// Read/Write bit_array to a file
+//
+// File format is [8 bytes: for number of elements in array][data]
+// Number of bytes of data is: (int)((num_of_bits + 7) / 8)
+//
+
+// Saves bit array to a file
+// returns the number of bytes written
+bit_index_t bit_array_save(const BIT_ARRAY* bitarr, FILE* f);
+
+// Reads bit array from a file. bitarr is resized and filled.
+// Returns 1 on success, 0 on failure
+char bit_array_load(BIT_ARRAY* bitarr, FILE* f);
+
+
+//
+// Hash function
+//
+
+// Pass seed as 0 on first call, pass previous hash value if rehashing due
+// to a collision
+// Using bob jenkins hash lookup3
+uint64_t bit_array_hash(const BIT_ARRAY* bitarr, uint64_t seed);
+
+//
+// Randomness
+//
+
+// Set bits randomly with probability prob : 0 <= prob <= 1
+void bit_array_random(BIT_ARRAY* bitarr, float prob);
+
+// Shuffle the bits in an array randomly
+void bit_array_shuffle(BIT_ARRAY* bitarr);
+
+// Get the next permutation of an array with a fixed size and given number of
+// bits set.  Also known as next lexicographic permutation.
+// Given a bit array find the next lexicographic orginisation of the bits
+// Number of possible combinations given by (size choose bits_set) i.e. nCk
+// 00011 -> 00101 -> 00110 -> 01001 -> 01010 ->
+// 01100 -> 10001 -> 10010 -> 10100 -> 11000 -> 00011 (back to start)
+void bit_array_next_permutation(BIT_ARRAY* bitarr);
+
+//
+// Generally useful functions
+//
+
+// Generalised 'binary to string' function
+// Adds bits to the string in order of lsb to msb
+// e.g. 0b11010 (26 in decimal) would come out as "01011"
+char* bit_array_word2str(const void *ptr, size_t num_of_bits, char *str);
+
+// Same as above but in reverse
+char* bit_array_word2str_rev(const void *ptr, size_t num_of_bits, char *str);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif

+ 205 - 0
bit_macros.h

@@ -0,0 +1,205 @@
+/*
+ bit_macros.h
+ project: bit array C library
+ url: https://github.com/noporpoise/BitArray/
+ author: Isaac Turner <turner.isaac@gmail.com>
+ license: Public Domain, no warranty
+ date: Dec 2013
+*/
+
+#ifndef BITSET_H_
+#define BITSET_H_
+
+#include <inttypes.h>
+#include <sched.h>
+
+// trailing_zeros is number of least significant zeros
+// leading_zeros is number of most significant zeros
+#if defined(_WIN32)
+  #define trailing_zeros(x) ({ __typeof(x) _r; _BitScanReverse64(&_r, x); _r; })
+  #define leading_zeros(x) ({ __typeof(x) _r; _BitScanForward64(&_r, x); _r; })
+#else
+  #define trailing_zeros(x) ((x) ? (__typeof(x))__builtin_ctzll(x) : (__typeof(x))sizeof(x)*8)
+  #define leading_zeros(x) ((x) ? (__typeof(x))__builtin_clzll(x) : (__typeof(x))sizeof(x)*8)
+#endif
+
+// Get index of top set bit. If x is 0 return nbits
+#define top_set_bit(x) ((x) ? sizeof(x)*8-leading_zeros(x)-1 : sizeof(x)*8)
+
+#define roundup_bits2bytes(bits)   (((bits)+7)/8)
+#define roundup_bits2words32(bits) (((bits)+31)/32)
+#define roundup_bits2words64(bits) (((bits)+63)/64)
+
+// Round a number up to the nearest number that is a power of two
+#define roundup2pow(x) (1UL << (64 - leading_zeros(x)))
+
+#define rot32(x,r) (((x)<<(r)) | ((x)>>(32-(r))))
+#define rot64(x,r) (((x)<<(r)) | ((x)>>(64-(r))))
+
+// need to check for length == 0, undefined behaviour if uint64_t >> 64 etc
+#define bitmask(nbits,type) ((nbits) ? ~(type)0 >> (sizeof(type)*8-(nbits)): (type)0)
+#define bitmask32(nbits) bitmask(nbits,uint32_t)
+#define bitmask64(nbits) bitmask(nbits,uint64_t)
+
+// A possibly faster way to combine two words with a mask
+//#define bitmask_merge(a,b,abits) ((a & abits) | (b & ~abits))
+#define bitmask_merge(a,b,abits) (b ^ ((a ^ b) & abits))
+
+// Swap lowest four bits. A nibble is 4 bits (i.e. half a byte)
+#define rev_nibble(x) ((((x)&1)<<3)|(((x)&2)<<1)|(((x)&4)>>1)|(((x)&8)>>3))
+
+//
+// Bit array (bitset)
+//
+// bitsetX_wrd(): get word for a given position
+// bitsetX_idx(): get index within word for a given position
+#define _VOLPTR(x) ((volatile __typeof(x) *)(&(x)))
+#define _VOLVALUE(x) (*_VOLPTR(x))
+
+#define _TYPESHIFT(arr,word,shift) \
+        ((__typeof(*(arr)))((__typeof(*(arr)))(word) << (shift)))
+
+#define bitsetX_wrd(wrdbits,pos) ((pos) / (wrdbits))
+#define bitsetX_idx(wrdbits,pos) ((pos) % (wrdbits))
+
+#define bitset32_wrd(pos) ((pos) >> 5)
+#define bitset32_idx(pos) ((pos) & 31)
+
+#define bitset64_wrd(pos) ((pos) >> 6)
+#define bitset64_idx(pos) ((pos) & 63)
+
+//
+// Bit functions on arrays
+//
+#define bitset2_get(arr,wrd,idx)     (((arr)[wrd] >> (idx)) & 0x1)
+#define bitset2_set(arr,wrd,idx)     ((arr)[wrd] |=  _TYPESHIFT(arr,1,idx))
+#define bitset2_del(arr,wrd,idx)     ((arr)[wrd] &=~ _TYPESHIFT(arr,1,idx))
+#define bitset2_tgl(arr,wrd,idx)     ((arr)[wrd] ^=  _TYPESHIFT(arr,1,idx))
+#define bitset2_or(arr,wrd,idx,bit)  ((arr)[wrd] |=  _TYPESHIFT(arr,bit,idx))
+#define bitset2_xor(arr,wrd,idx,bit) ((arr)[wrd]  = ~((arr)[wrd] ^ (~_TYPESHIFT(arr,bit,idx))))
+#define bitset2_and(arr,wrd,idx,bit) ((arr)[wrd] &= (_TYPESHIFT(arr,bit,idx) | ~_TYPESHIFT(arr,1,idx)))
+#define bitset2_cpy(arr,wrd,idx,bit) ((arr)[wrd]  = ((arr)[wrd] &~ _TYPESHIFT(arr,1,idx)) | _TYPESHIFT(arr,bit,idx))
+
+//
+// Thread safe versions
+//
+// They return the value of the bit (0 or 1) before it was updated
+#define bitset2_get_mt(arr,wrd,idx)     bitset2_get(_VOLPTR(*(arr)),wrd,idx)
+#define bitset2_set_mt(arr,wrd,idx)     ((__sync_fetch_and_or (_VOLPTR((arr)[wrd]),  _TYPESHIFT(arr,1,idx)) >> (idx))&1)
+#define bitset2_del_mt(arr,wrd,idx)     ((__sync_fetch_and_and(_VOLPTR((arr)[wrd]), ~_TYPESHIFT(arr,1,idx)) >> (idx))&1)
+#define bitset2_tgl_mt(arr,wrd,idx)     ((__sync_fetch_and_xor(_VOLPTR((arr)[wrd]),  _TYPESHIFT(arr,1,idx)) >> (idx))&1)
+#define bitset2_or_mt(arr,wrd,idx,bit)  ((__sync_fetch_and_or (_VOLPTR((arr)[wrd]),  _TYPESHIFT(arr,bit,idx)) >> (idx))&1)
+#define bitset2_xor_mt(arr,wrd,idx,bit) ((__sync_fetch_and_xor(_VOLPTR((arr)[wrd]),  _TYPESHIFT(arr,bit,idx)) >> (idx))&1)
+#define bitset2_and_mt(arr,wrd,idx,bit) ((__sync_fetch_and_and(_VOLPTR((arr)[wrd]), (_TYPESHIFT(arr,bit,idx) | ~_TYPESHIFT(arr,1,idx))) >> (idx))&1)
+#define bitset2_cpy_mt(arr,wrd,idx,bit) ((bit) ? bitset2_set_mt(arr,wrd,idx) : bitset2_del_mt(arr,wrd,idx))
+
+//
+// Auto detect size of type from pointer
+//
+#define bitset_wrd(arr,pos) bitsetX_wrd(sizeof(*(arr))*8,pos)
+#define bitset_idx(arr,pos) bitsetX_idx(sizeof(*(arr))*8,pos)
+#define bitset_op(func,arr,pos)      func(arr, bitset_wrd(arr,pos), bitset_idx(arr,pos))
+#define bitset_op2(func,arr,pos,bit) func(arr, bitset_wrd(arr,pos), bitset_idx(arr,pos), bit)
+
+// Auto-detect type size: bit functions
+#define bitset_get(arr,pos)     bitset_op(bitset2_get, arr, pos)
+#define bitset_set(arr,pos)     bitset_op(bitset2_set, arr, pos)
+#define bitset_del(arr,pos)     bitset_op(bitset2_del, arr, pos)
+#define bitset_tgl(arr,pos)     bitset_op(bitset2_tgl, arr, pos)
+#define bitset_or(arr,pos,bit)  bitset_op2(bitset2_or, arr, pos, bit)
+#define bitset_xor(arr,pos,bit) bitset_op2(bitset2_xor, arr, pos, bit)
+#define bitset_and(arr,pos,bit) bitset_op2(bitset2_and, arr, pos, bit)
+#define bitset_cpy(arr,pos,bit) bitset_op2(bitset2_cpy, arr, pos, bit)
+
+// Auto-detect type size: thread safe bit functions
+// They return the value of the bit (0 or 1) before it was updated
+#define bitset_get_mt(arr,pos)     bitset_op(bitset2_get_mt,  arr, pos)
+#define bitset_set_mt(arr,pos)     bitset_op(bitset2_set_mt,  arr, pos)
+#define bitset_del_mt(arr,pos)     bitset_op(bitset2_del_mt,  arr, pos)
+#define bitset_tgl_mt(arr,pos)     bitset_op(bitset2_tgl_mt,  arr, pos)
+#define bitset_or_mt(arr,pos,bit)  bitset_op2(bitset2_or_mt,  arr, pos, bit)
+#define bitset_xor_mt(arr,pos,bit) bitset_op2(bitset2_xor_mt, arr, pos, bit)
+#define bitset_and_mt(arr,pos,bit) bitset_op2(bitset2_and_mt, arr, pos, bit)
+#define bitset_cpy_mt(arr,pos,bit) bitset_op2(bitset2_cpy_mt, arr, pos, bit)
+
+// Clearing a word does not return a meaningful value
+#define bitset_clear_word(arr,pos) ((arr)[bitset_wrd(arr,pos)] = 0)
+#define bitset_clear_word_mt(arr,pos) (_VOLVALUE((arr)[bitset_wrd(arr,pos)]) = 0)
+
+//
+// Compact bit array of spin locks
+// These are most effecient when arr is of type: volatile char*
+//
+// Acquire a lock
+#define bitlock_acquire_block(arr,pos,wait,abandon) do {                       \
+  size_t _w = bitset_wrd(arr,pos);                                             \
+  __typeof(*(arr)) _o, _n, _b = _TYPESHIFT(arr, 1, bitset_idx(arr,pos));       \
+  do {                                                                         \
+    while((_o = _VOLVALUE((arr)[_w])) & _b) { wait }                           \
+    abandon                                                                    \
+    _n = _o | _b;                                                              \
+  } while(!__sync_bool_compare_and_swap(_VOLPTR((arr)[_w]), _o, _n));          \
+  __sync_synchronize(); /* Must not move commands to before acquiring lock */  \
+} while(0)
+
+// Undefined behaviour if you do not already hold the lock
+#define bitlock_release(arr,pos) do {                                          \
+  size_t _w = bitset_wrd(arr,pos);                                             \
+  __typeof(*(arr)) _mask = ~_TYPESHIFT(arr, 1, bitset_idx(arr,pos));           \
+  __sync_synchronize(); /* Must get the lock before releasing it */            \
+  __sync_and_and_fetch(_VOLPTR((arr)[_w]), _mask);                             \
+} while(0)
+
+#define bitlock_acquire(arr,pos) bitlock_acquire_block(arr,pos,{},{})
+
+// calls yield if cannot acquire the lock
+#define bitlock_yield_acquire(arr,pos) bitlock_acquire_block(arr,pos,sched_yield();,{})
+
+// Block until we get the lock or someone else does
+// sets the memory pointed to by retptr to 1 if we got the lock, 0 otherwise
+#define bitlock_try_acquire(arr,pos,retptr) do {                               \
+  *retptr = 1; /* default to success, set to zero if locked */                 \
+  bitlock_acquire_block(arr,pos,{*retptr=0;break;},if(!*retptr){break;});      \
+} while(0)
+
+/*
+ * Byteswapping
+ */
+
+/* clang uses these to check for features */
+#ifndef __has_feature
+#define __has_feature(x) 0
+#endif
+
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+
+/* GCC versions < 4.3 do not have __builtin_bswapX() */
+#if ( defined(__clang__) && !__has_builtin(__builtin_bswap64) ) ||             \
+    ( !defined(__clang__) && defined(__GNUC__) && defined(__GNUC_MINOR__) &&   \
+      ( (__GNUC__ < 4)  || (__GNUC__ == 4 && __GNUC_MINOR__ < 3)) )
+  #define byteswap64(x) ( (((uint64_t)(x) << 56))                       | \
+                          (((uint64_t)(x) << 40) & 0xff000000000000ULL) | \
+                          (((uint64_t)(x) << 24) & 0xff0000000000ULL)   | \
+                          (((uint64_t)(x) <<  8) & 0xff00000000ULL)     | \
+                          (((uint64_t)(x) >>  8) & 0xff000000ULL)       | \
+                          (((uint64_t)(x) >> 24) & 0xff0000ULL)         | \
+                          (((uint64_t)(x) >> 40) & 0xff00ULL)           | \
+                          (((uint64_t)(x) >> 56)) )
+
+  #define byteswap32(x) ( (((uint32_t)(x) << 24))                       | \
+                          (((uint32_t)(x) <<  8) & 0xff0000U)           | \
+                          (((uint32_t)(x) >>  8) & 0xff00U)             | \
+                          (((uint32_t)(x) >> 24)) )
+
+  /* uint16_t type might be bigger than 2 bytes, so need to mask */
+  #define byteswap16(x) ( (((uint16_t)(x) & 0xff) << 8) | \
+                          (((uint16_t)(x) >> 8) & 0xff) )
+#else
+  #define byteswap64(x) __builtin_bswap64(x)
+  #define byteswap32(x) __builtin_bswap64(x)
+  #define byteswap16(x) __builtin_bswap64(x)
+#endif
+
+#endif /* BITLOCK_H_ */

+ 4 - 0
cpu.c

@@ -42,6 +42,7 @@ static cpu_t * cpu_new(unsigned int id)
 	new->old_load = 0;
 	new->load = 0;
 	new->irqs = lub_list_new(irq_list_compare);
+	cpus_init(new->cpumask);
 	cpus_clear(new->cpumask);
 	cpu_set(new->id, new->cpumask);
 
@@ -57,6 +58,7 @@ static void cpu_free(cpu_t *cpu)
 		lub_list_node_free(node);
 	}
 	lub_list_free(cpu->irqs);
+	cpus_free(cpu->cpumask);
 	free(cpu);
 }
 
@@ -163,6 +165,7 @@ int scan_cpus(lub_list_t *cpus, int ht)
 	char *str = NULL;
 	size_t sz;
 	cpumask_t thread_siblings;
+	cpus_init(thread_siblings);
 
 	for (id = 0; id < NR_CPUS; id++) {
 		snprintf(path, sizeof(path), "%s/cpu%d", SYSFS_CPU_PATH, id);
@@ -217,6 +220,7 @@ int scan_cpus(lub_list_t *cpus, int ht)
 		new->core_id = core_id;
 		cpu_list_add(cpus, new);
 	}
+	cpus_free(thread_siblings);
 	free(str);
 
 	return 0;

+ 42 - 139
cpumask.h

@@ -82,139 +82,53 @@
  *    inside a macro, the way we do the other calls.
  */
 
+#include <stdlib.h>
 #include "bitmap.h"
+#include "bit_array.h"
+#include "hexio.h"
 
-typedef struct { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t;
+typedef struct { 
+	BIT_ARRAY* bits;
+} cpumask_t;
 extern cpumask_t _unused_cpumask_arg_;
 
-#define cpu_set(cpu, dst) __cpu_set((cpu), &(dst))
-static inline void __cpu_set(int cpu, volatile cpumask_t *dstp)
-{
-	set_bit(cpu, dstp->bits);
-}
-
-#define cpu_clear(cpu, dst) __cpu_clear((cpu), &(dst))
-static inline void __cpu_clear(int cpu, volatile cpumask_t *dstp)
-{
-	clear_bit(cpu, dstp->bits);
-}
-
-#define cpus_setall(dst) __cpus_setall(&(dst), NR_CPUS)
-static inline void __cpus_setall(cpumask_t *dstp, int nbits)
-{
-	bitmap_fill(dstp->bits, nbits);
-}
-
-#define cpus_clear(dst) __cpus_clear(&(dst), NR_CPUS)
-static inline void __cpus_clear(cpumask_t *dstp, int nbits)
-{
-	bitmap_zero(dstp->bits, nbits);
-}
-
-/* No static inline type checking - see Subtlety (1) above. */
-#define cpu_isset(cpu, cpumask) test_bit((cpu), (cpumask).bits)
-
-#define cpus_and(dst, src1, src2) __cpus_and(&(dst), &(src1), &(src2), NR_CPUS)
-static inline void __cpus_and(cpumask_t *dstp, const cpumask_t *src1p,
-					const cpumask_t *src2p, int nbits)
-{
-	bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits);
-}
+#define cpus_init(dst) ((dst).bits = bit_array_create(NR_CPUS))
+#define cpus_free(dst) (bit_array_free((dst).bits))
+#define cpus_copy(dst, src) bit_array_copy_all((dst).bits, (src).bits)
 
-#define cpus_or(dst, src1, src2) __cpus_or(&(dst), &(src1), &(src2), NR_CPUS)
-static inline void __cpus_or(cpumask_t *dstp, const cpumask_t *src1p,
-					const cpumask_t *src2p, int nbits)
-{
-	bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits);
-}
+#define cpu_set(cpu, dst) bit_array_set_bit((dst).bits, cpu)
+#define cpu_clear(cpu, dst) bit_array_clear_bit((dst).bits, cpu)
 
-#define cpus_xor(dst, src1, src2) __cpus_xor(&(dst), &(src1), &(src2), NR_CPUS)
-static inline void __cpus_xor(cpumask_t *dstp, const cpumask_t *src1p,
-					const cpumask_t *src2p, int nbits)
-{
-	bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits);
-}
+#define cpus_setall(dst) bit_array_set_all((dst).bits)
+#define cpus_clear(dst) bit_array_clear_all((dst).bits)
 
-#define cpus_andnot(dst, src1, src2) \
-				__cpus_andnot(&(dst), &(src1), &(src2), NR_CPUS)
-static inline void __cpus_andnot(cpumask_t *dstp, const cpumask_t *src1p,
-					const cpumask_t *src2p, int nbits)
-{
-	bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits);
-}
+#define cpu_isset(cpu, cpumask) bit_array_get_bit((cpumask).bits, cpu)
 
-#define cpus_complement(dst, src) __cpus_complement(&(dst), &(src), NR_CPUS)
-static inline void __cpus_complement(cpumask_t *dstp,
-					const cpumask_t *srcp, int nbits)
-{
-	bitmap_complement(dstp->bits, srcp->bits, nbits);
-}
+#define cpus_and(dst, src1, src2) bit_array_and((dst).bits, (src1).bits, (src2).bits)
+#define cpus_or(dst, src1, src2) bit_array_or((dst).bits, (src1).bits, (src2).bits)
+#define cpus_xor(dst, src1, src2) bit_array_xor((dst).bits, (src1).bits, (src2).bits)
+#define cpus_complement(dst, src) bit_array_not((dst).bits, (src).bits)
 
-#define cpus_equal(src1, src2) __cpus_equal(&(src1), &(src2), NR_CPUS)
-static inline int __cpus_equal(const cpumask_t *src1p,
-					const cpumask_t *src2p, int nbits)
-{
-	return bitmap_equal(src1p->bits, src2p->bits, nbits);
-}
+#define cpus_equal(src1, src2) (bit_array_cmp((src1).bits, (src2).bits) == 0)
+#define cpus_empty(src) (bit_array_num_bits_set((src).bits) == 0)
+#define cpus_full(src) (bit_array_num_bits_cleared((src).bits) == 0)
+#define cpus_weight(cpumask) bit_array_num_bits_set((cpumask).bits)\
 
-#define cpus_intersects(src1, src2) __cpus_intersects(&(src1), &(src2), NR_CPUS)
-static inline int __cpus_intersects(const cpumask_t *src1p,
-					const cpumask_t *src2p, int nbits)
-{
-	return bitmap_intersects(src1p->bits, src2p->bits, nbits);
-}
+#define cpus_shift_right(dst, n) bit_array_shift_right((dst).bits, n, 0)
+#define cpus_shift_left(dst, n) bit_array_shift_left((dst).bits, n, 0)
 
-#define cpus_subset(src1, src2) __cpus_subset(&(src1), &(src2), NR_CPUS)
-static inline int __cpus_subset(const cpumask_t *src1p,
-					const cpumask_t *src2p, int nbits)
+static inline int __first_cpu(const cpumask_t srcp)
 {
-	return bitmap_subset(src1p->bits, src2p->bits, nbits);
+	bit_index_t res = bit_array_length(srcp.bits);
+	bit_array_find_first_set_bit(srcp.bits, &res);
+        return res;
 }
 
-#define cpus_empty(src) __cpus_empty(&(src), NR_CPUS)
-static inline int __cpus_empty(const cpumask_t *srcp, int nbits)
-{
-	return bitmap_empty(srcp->bits, nbits);
-}
-
-#define cpus_full(cpumask) __cpus_full(&(cpumask), NR_CPUS)
-static inline int __cpus_full(const cpumask_t *srcp, int nbits)
-{
-	return bitmap_full(srcp->bits, nbits);
-}
-
-#define cpus_weight(cpumask) __cpus_weight(&(cpumask), NR_CPUS)
-static inline int __cpus_weight(const cpumask_t *srcp, int nbits)
-{
-	return bitmap_weight(srcp->bits, nbits);
-}
-
-#define cpus_shift_right(dst, src, n) \
-			__cpus_shift_right(&(dst), &(src), (n), NR_CPUS)
-static inline void __cpus_shift_right(cpumask_t *dstp,
-					const cpumask_t *srcp, int n, int nbits)
-{
-	bitmap_shift_right(dstp->bits, srcp->bits, n, nbits);
-}
-
-#define cpus_shift_left(dst, src, n) \
-			__cpus_shift_left(&(dst), &(src), (n), NR_CPUS)
-static inline void __cpus_shift_left(cpumask_t *dstp,
-					const cpumask_t *srcp, int n, int nbits)
-{
-	bitmap_shift_left(dstp->bits, srcp->bits, n, nbits);
-}
-
-static inline int __first_cpu(const cpumask_t *srcp)
-{
-        return ffs(*srcp->bits)-1;
-}
-
-#define first_cpu(src) __first_cpu(&(src))
+#define first_cpu(src) __first_cpu((src))
 int __next_cpu(int n, const cpumask_t *srcp);
-#define next_cpu(n, src) __next_cpu((n), &(src))
+#define next_cpu(n, src) __next_cpu((n), (src))
 
-#define cpumask_of_cpu(cpu)						\
+/*#define cpumask_of_cpu(cpu)						\
 ({									\
 	typeof(_unused_cpumask_arg_) m;					\
 	if (sizeof(m) == sizeof(unsigned long)) {			\
@@ -256,23 +170,12 @@ int __next_cpu(int n, const cpumask_t *srcp);
 } }
 
 #define cpus_addr(src) ((src).bits)
+*/
 
-#define cpumask_scnprintf(buf, len, src) \
-			__cpumask_scnprintf((buf), (len), &(src), NR_CPUS)
-static inline int __cpumask_scnprintf(char *buf, int len,
-					const cpumask_t *srcp, int nbits)
-{
-	return bitmap_scnprintf(buf, len, srcp->bits, nbits);
-}
-
-#define cpumask_parse_user(ubuf, ulen, dst) \
-			__cpumask_parse_user((ubuf), (ulen), &(dst), NR_CPUS)
-static inline int __cpumask_parse_user(const char  *buf, int len,
-					cpumask_t *dstp, int nbits)
-{
-	return bitmap_parse(buf, len, dstp->bits, nbits);
-}
+#define cpumask_scnprintf(buf, len, src) bitmask_displayhex((buf), (len), (src).bits)
+#define cpumask_parse_user(ubuf, ulen, dst) bitmask_parsehex((ubuf), (dst).bits)
 
+/*
 #define cpulist_scnprintf(buf, len, src) \
 			__cpulist_scnprintf((buf), (len), &(src), NR_CPUS)
 static inline int __cpulist_scnprintf(char *buf, int len,
@@ -302,16 +205,16 @@ static inline void __cpus_remap(cpumask_t *dstp, const cpumask_t *srcp,
 {
 	bitmap_remap(dstp->bits, srcp->bits, oldp->bits, newp->bits, nbits);
 }
-
-#if NR_CPUS > 1
+*/
+/*#if NR_CPUS > 1
 #define for_each_cpu_mask(cpu, mask)		\
 	for ((cpu) = first_cpu(mask);		\
 		(cpu) < NR_CPUS;		\
 		(cpu) = next_cpu((cpu), (mask)))
-#else /* NR_CPUS == 1 */
-#define for_each_cpu_mask(cpu, mask)		\
+#else *//* NR_CPUS == 1 */
+/*#define for_each_cpu_mask(cpu, mask)		\
 	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
-#endif /* NR_CPUS */
+#endif *//* NR_CPUS */
 
 /*
  * The following particular system cpumasks and operations manage
@@ -368,7 +271,7 @@ static inline void __cpus_remap(cpumask_t *dstp, const cpumask_t *srcp,
  *        #define x_map phys_x_map
  *        main(){ set1(3); set2(5); }
  */
-
+/*
 extern cpumask_t cpu_possible_map;
 extern cpumask_t cpu_online_map;
 extern cpumask_t cpu_present_map;
@@ -396,5 +299,5 @@ int __any_online_cpu(const cpumask_t *mask);
 #define for_each_possible_cpu(cpu)  for_each_cpu_mask((cpu), cpu_possible_map)
 #define for_each_online_cpu(cpu)  for_each_cpu_mask((cpu), cpu_online_map)
 #define for_each_present_cpu(cpu) for_each_cpu_mask((cpu), cpu_present_map)
-
+*/
 #endif /* __LINUX_CPUMASK_H */

+ 137 - 0
hexio.c

@@ -0,0 +1,137 @@
+/*
+ * Temporaly use displayhex and parsehex from LGPL libbitmask 
+ * Start of libbitmask code
+ */
+
+/*
+ * bitmask user library implementation.
+ *
+ * Copyright (c) 2004-2006 Silicon Graphics, Inc. All rights reserved.
+ *
+ * Paul Jackson <pj@sgi.com>
+ */
+
+/*
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU Lesser General Public License as published by
+ *  the Free Software Foundation; either version 2.1 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include "bit_array.h"
+#include "hexio.h"
+
+/*
+ * Write hex word representation of bmp to buf, 32 bits per
+ * comma-separated, zero-filled hex word.  Do not write more
+ * than buflen chars to buf.
+ *
+ * Return number of chars that would have been written
+ * if buf were large enough.
+ */
+
+const char *nexttoken(const char *q,  int sep)
+{
+	if (q)
+		q = strchr(q, sep);
+	if (q)
+		q++;
+	return q;
+}
+
+char _getbit(const BIT_ARRAY* bitarr, bit_index_t b)
+{
+  if (b >= bitarr->num_of_bits)
+  	return 0;
+  return bit_array_get_bit(bitarr, b);
+}
+
+int bitmask_displayhex(char *buf, int buflen, const BIT_ARRAY *bmp)
+{
+	int chunk;
+	int cnt = 0;
+	const char *sep = "";
+
+	if (buflen < 1)
+		return 0;
+	buf[0] = 0;
+
+	for (chunk = howmany(bmp->num_of_bits, HEXCHUNKSZ); chunk >= 0; chunk--) {
+		uint32_t val = 0;
+		int bit;
+
+		for (bit = HEXCHUNKSZ - 1; bit >= 0; bit--)
+			val = val << 1 | _getbit(bmp, chunk * HEXCHUNKSZ + bit);
+		if (val != 0 || cnt != 0 || chunk == 0 )
+		{
+			cnt += snprintf(buf + cnt, max(buflen - cnt, 0), "%s%0*x",
+				sep, HEXCHARSZ, val);
+			sep = ",";
+		}
+	}
+	return cnt;
+}
+
+/*
+ * Parse hex word representation in buf to bmp.
+ *
+ * Returns -1 on error, leaving unspecified results in bmp.
+ */
+
+int bitmask_parsehex(const char *buf, BIT_ARRAY *bmp)
+{
+	const char *p, *q;
+	int nchunks = 0, chunk;
+
+	bit_array_clear_all(bmp);
+
+	q = buf;
+	while (p = q, q = nexttoken(q, ','), p)
+		nchunks++;
+
+	chunk = nchunks - 1;
+	q = buf;
+	while (p = q, q = nexttoken(q, ','), p) {
+		uint32_t val;
+		int bit;
+		char *endptr;
+		int nchars_read, nchars_unread;
+
+		val = strtoul(p, &endptr, 16);
+
+		/* We should have consumed 1 to 8 (HEXCHARSZ) chars */
+		nchars_read = endptr - p;
+		if (nchars_read < 1 || nchars_read > HEXCHARSZ)
+			goto err;
+
+		/* We should have consumed up to next comma */
+		nchars_unread = q - endptr;
+		if (q != NULL && nchars_unread != 1)
+			goto err;
+
+		for (bit = HEXCHUNKSZ - 1; bit >= 0; bit--) {
+			int n = chunk * HEXCHUNKSZ + bit;
+			if (n >= bmp->num_of_bits)
+				goto err;
+			bit_array_assign_bit(bmp, n, (val >> bit) & 1);
+		}
+		chunk--;
+	}
+	return 0;
+err:
+	bit_array_clear_all(bmp);
+	return -1;
+}

+ 17 - 0
hexio.h

@@ -0,0 +1,17 @@
+#include "bit_array.h"
+
+#ifndef HEX_IO_H
+#define HEX_IO_H
+#define HEXCHUNKSZ 32
+#define HEXCHARSZ 8
+#ifndef max
+	#define max(x,y) ((x) > (y) ? (x) : (y))
+#endif
+#ifndef howmany
+	#define howmany(x, y) (((x)+((y)-1))/(y))
+#endif
+
+int bitmask_displayhex(char *buf, int buflen, const BIT_ARRAY *bmp);;
+int bitmask_parsehex(const char *buf, BIT_ARRAY *bmp);
+
+#endif

+ 11 - 1
irq.c

@@ -37,6 +37,8 @@ static irq_t * irq_new(int num)
 	new->intr = 0;
 	new->cpu = NULL;
 	new->weight = 0;
+	cpus_init(new->local_cpus);
+	cpus_init(new->affinity);
 	cpus_setall(new->local_cpus);
 	cpus_clear(new->affinity);
 	new->blacklisted = 0;
@@ -48,6 +50,8 @@ static void irq_free(irq_t *irq)
 {
 	free(irq->type);
 	free(irq->desc);
+	cpus_free(irq->local_cpus);
+	cpus_free(irq->affinity);
 	free(irq);
 }
 
@@ -87,6 +91,8 @@ int irq_list_free(lub_list_t *irqs)
 		irq_t *irq;
 		irq = (irq_t *)lub_list_node__get_data(iter);
 		irq_free(irq);
+		cpus_free(irq->local_cpus);
+		cpus_free(irq->affinity);
 		lub_list_del(irqs, iter);
 		lub_list_node_free(iter);
 	}
@@ -131,8 +137,10 @@ static int parse_local_cpus(lub_list_t *irqs, const char *sysfs_path,
 	char *str = NULL;
 	size_t sz;
 	cpumask_t local_cpus;
+	cpus_init(local_cpus);
 	irq_t *irq = NULL;
 	cpumask_t cpumask;
+	cpus_init(cpumask);
 
 	irq = irq_list_search(irqs, num);
 	if (!irq)
@@ -140,7 +148,7 @@ static int parse_local_cpus(lub_list_t *irqs, const char *sysfs_path,
 
 	/* Find proximity in config file. */
 	if (!pxm_search(pxms, sysfs_path, &cpumask)) {
-		irq->local_cpus = cpumask;
+		cpus_copy(irq->local_cpus, cpumask);
 		return 0;
 	}
 
@@ -156,6 +164,8 @@ static int parse_local_cpus(lub_list_t *irqs, const char *sysfs_path,
 	fclose(fd);
 	cpumask_parse_user(str, strlen(str), local_cpus);
 	cpus_and(irq->local_cpus, irq->local_cpus, local_cpus);
+	cpus_free(local_cpus);
+	cpus_free(cpumask);
 	free(str);
 
 	return 0;

+ 4 - 0
numa.c

@@ -29,6 +29,7 @@ static numa_t * numa_new(unsigned int id)
 	if (!(new = malloc(sizeof(*new))))
 		return NULL;
 	new->id = id;
+	cpus_init(new->cpumap);
 	cpus_setall(new->cpumap);
 
 	return new;
@@ -36,6 +37,7 @@ static numa_t * numa_new(unsigned int id)
 
 static void numa_free(numa_t *numa)
 {
+	cpus_free(numa->cpumap);
 	free(numa);
 }
 
@@ -108,6 +110,7 @@ int scan_numas(lub_list_t *numas)
 	char *str = NULL;
 	size_t sz;
 	cpumask_t cpumap;
+	cpus_init(cpumap);
 
 	for (id = 0; id < NR_NUMA_NODES; id++) {
 		snprintf(path, sizeof(path),
@@ -134,5 +137,6 @@ int scan_numas(lub_list_t *numas)
 	}
 	free(str);
 
+	cpus_free(cpumap);
 	return 0;
 }

+ 4 - 0
pxm.c

@@ -22,6 +22,7 @@ static pxm_t * pxm_new(const char *addr)
 	if (!(new = malloc(sizeof(*new))))
 		return NULL;
 	new->addr = strdup(addr);
+	cpus_init(new->cpumask);
 	cpus_clear(new->cpumask);
 
 	return new;
@@ -33,6 +34,7 @@ static void pxm_free(pxm_t *pxm)
 		return;
 	if (pxm->addr)
 		free(pxm->addr);
+	cpus_free(pxm->cpumask);
 	free(pxm);
 }
 
@@ -129,6 +131,7 @@ int parse_pxm_config(const char *fname, lub_list_t *pxms, lub_list_t *numas)
 		char *pxm_cmd = NULL;
 		char *pxm_pxm = NULL;
 		cpumask_t cpumask;
+		cpus_init(cpumask);
 
 		ln++; /* Next line */
 		if (getline(&line, &size, file) <= 0)
@@ -196,6 +199,7 @@ int parse_pxm_config(const char *fname, lub_list_t *pxms, lub_list_t *numas)
 		cpus_clear(pxm->cpumask);
 		cpus_or(pxm->cpumask, pxm->cpumask, cpumask);
 		pxm_list_add(pxms, pxm);
+		cpus_free(cpumask);
 	}
 
 	fclose(file);