--- jsr166/src/jsr166e/StripedAdder.java 2011/07/24 15:08:21 1.5 +++ jsr166/src/jsr166e/StripedAdder.java 2011/07/30 16:26:34 1.12 @@ -5,7 +5,6 @@ */ package jsr166e; -import java.util.Arrays; import java.util.Random; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; @@ -15,27 +14,33 @@ import java.io.ObjectInputStream; import java.io.ObjectOutputStream; /** - * A set of variables that together maintain a sum. When updates - * (method {@link #add}) are contended across threads, this set of - * adder variables may grow dynamically to reduce contention. Method - * {@link #sum} returns the current combined total across these - * adders. This value is NOT an atomic snapshot (concurrent - * updates may occur while the sum is being calculated), and so cannot - * be used alone for fine-grained synchronization control. + * One or more variables that together maintain an initially zero sum. + * When updates (method {@link #add}) are contended across threads, + * the set of variables may grow dynamically to reduce contention. * - *
This class may be applicable when many threads frequently - * update a common sum that is used for purposes such as collecting - * statistics. In this case, performance may be significantly faster - * than using a shared {@link AtomicLong}, at the expense of using - * much more space. On the other hand, if it is known that only one - * thread can ever update the sum, performance may be significantly - * slower than just updating a local variable. + *
This class is usually preferable to {@link AtomicLong} when + * multiple threads update a common sum that is used for purposes such + * as collecting statistics, not for fine-grained synchronization + * control. Under high update contention, throughput of this class is + * expected to be significantly higher, at the expense of higher space + * consumption. Under low contention, this class imposes very little + * time and space overhead compared to AtomicLong. On the other hand, + * in contexts where it is statically known that only one thread can + * ever update a sum, time and space overhead is noticeably greater + * than just updating a local variable. * - *
A StripedAdder may optionally be constructed with a given - * expected contention level; i.e., the number of threads that are - * expected to concurrently update the sum. Supplying an accurate - * value may improve performance by reducing the need for dynamic - * adjustment. + *
Method {@link #sum} returns the current combined total across
+ * the variables maintaining the sum. This value is NOT an
+ * atomic snapshot: Concurrent updates may occur while the sum is
+ * being calculated. However, updates cannot be "lost", so invocation
+ * of sum
in the absence of concurrent updates always
+ * returns an accurate result. The sum may also be reset
+ * to zero, as an alternative to creating a new adder. However,
+ * method {@link #reset} is intrinsically racy, so should only be used
+ * when it is known that no threads are concurrently updating the sum.
+ *
+ *
jsr166e note: This class is targeted to be placed in
+ * java.util.concurrent.atomic
*
* @author Doug Lea
*/
@@ -43,131 +48,128 @@ public class StripedAdder implements Ser
private static final long serialVersionUID = 7249069246863182397L;
/*
- * A StripedAdder maintains a table of Atomic long variables. The
- * table is indexed by per-thread hash codes that are initialized
- * to random values.
- *
- * The table doubles in size upon contention (as indicated by
- * failed CASes when performing add()), but is capped at the
- * nearest power of two >= #CPUS. This reflects the idea that,
- * when there are more threads than CPUs, then if each thread were
- * bound to a CPU, there would exist a perfect hash function
- * mapping threads to slots that eliminates collisions. When we
- * reach capacity, we search for this mapping by randomly varying
- * the hash codes of colliding threads. Because search is random,
- * and failures only become known via CAS failures, convergence
- * will be slow, and because threads are typically not bound to
- * CPUS forever, may not occur at all. However, despite these
- * limitations, observed contention is typically low in these
- * cases.
+ * A StripedAdder maintains a lazily-initialized table of
+ * atomically updated variables, plus an extra "base" field. The
+ * table size is a power of two. Indexing uses masked per-thread
+ * hash codes.
*
- * Table entries are of class Adder; a form of AtomicLong padded
+ * Table entries are of class Cell; a variant of AtomicLong padded
* to reduce cache contention on most processors. Padding is
- * overkill for most Atomics because they are most often
- * irregularly scattered in memory and thus don't interfere much
- * with each other. But Atomic objects residing in arrays will
- * tend to be placed adjacent to each other, and so will most
- * often share cache lines without this precaution. Adders are
- * constructed upon first use, which further improves per-thread
- * locality and helps reduce (an already large) footprint.
+ * overkill for most Atomics because they are usually irregularly
+ * scattered in memory and thus don't interfere much with each
+ * other. But Atomic objects residing in arrays will tend to be
+ * placed adjacent to each other, and so will most often share
+ * cache lines (with a huge negative performance impact) without
+ * this precaution.
+ *
+ * In part because Cells are relatively large, we avoid creating
+ * them until they are needed. When there is no contention, all
+ * updates are made to the base field. Upon first contention (a
+ * failed CAS on base update), the table is initialized to size 2.
+ * The table size is doubled upon further contention until
+ * reaching the nearest power of two greater than or equal to the
+ * number of CPUS.
+ *
+ * Per-thread hash codes are initialized to random values.
+ * Contention and/or table collisions are indicated by failed
+ * CASes when performing an add operation (see method
+ * retryAdd). Upon a collision, if the table size is less than the
+ * capacity, it is doubled in size unless some other thread holds
+ * the lock. If a hashed slot is empty, and the lock is available, a
+ * new Cell is created. Otherwise, if the slot exists, a CAS is
+ * tried. Retries proceed by "double hashing", using a secondary
+ * hash (Marsaglia XorShift) to try to find a free slot.
+ *
+ * The table size is capped because, when there are more threads
+ * than CPUs, supposing that each thread were bound to a CPU,
+ * there would exist a perfect hash function mapping threads to
+ * slots that eliminates collisions. When we reach capacity, we
+ * search for this mapping by randomly varying the hash codes of
+ * colliding threads. Because search is random, and collisions
+ * only become known via CAS failures, convergence can be slow,
+ * and because threads are typically not bound to CPUs forever,
+ * may not occur at all. However, despite these limitations,
+ * observed contention rates are typically low in these cases.
+ *
+ * A single spinlock is used for initializing and resizing the
+ * table, as well as populating slots with new Cells. There is no
+ * need for a blocking lock: Upon lock contention, threads try
+ * other slots (or the base) rather than blocking. During these
+ * retries, there is increased contention and reduced locality,
+ * which is still better than alternatives.
+ *
+ * It is possible for a Cell to become unused when threads that
+ * once hashed to it terminate, as well as in the case where
+ * doubling the table causes no thread to hash to it under
+ * expanded mask. We do not try to detect or remove such cells,
+ * under the assumption that for long-running adders, observed
+ * contention levels will recur, so the cells will eventually be
+ * needed again; and for short-lived ones, it does not matter.
*
- * A single spinlock is used for resizing the table as well as
- * populating slots with new Adders. Upon lock contention, threads
- * try other slots rather than blocking. After initialization, at
- * least one slot exists, so retries will eventually find a
- * candidate Adder. During these retries, there is increased
- * contention and reduced locality, which is still better than
- * alternatives.
- */
-
- /**
- * Number of processors, to place a cap on table growth.
*/
- static final int NCPU = Runtime.getRuntime().availableProcessors();
- /**
- * The table size set upon first use when default-constructed
- */
- private static final int DEFAULT_ARRAY_SIZE = 8;
+ private static final int NCPU = Runtime.getRuntime().availableProcessors();
/**
- * Padded version of AtomicLong
+ * Padded variant of AtomicLong. The value field is placed
+ * between pads, hoping that the JVM doesn't reorder them.
+ * Updates are via inlined CAS in methods add and retryAdd.
*/
- static final class Adder extends AtomicLong {
- long p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pa, pb, pc, pd;
- Adder(long x) { super(x); }
+ static final class Cell {
+ volatile long p0, p1, p2, p3, p4, p5, p6;
+ volatile long value;
+ volatile long q0, q1, q2, q3, q4, q5, q6;
+ Cell(long x) { value = x; }
}
/**
- * Holder for the thread-local hash code. The code starts off with
- * a given random value, but may be set to a different value upon
- * collisions in retryAdd.
+ * Holder for the thread-local hash code. The code is initially
+ * random, but may be set to a different value upon collisions.
*/
static final class HashCode {
+ static final Random rng = new Random();
int code;
- HashCode(int h) { code = h; }
+ HashCode() {
+ int h = rng.nextInt(); // Avoid zero to allow xorShift rehash
+ code = (h == 0) ? 1 : h;
+ }
}
/**
* The corresponding ThreadLocal class
*/
static final class ThreadHashCode extends ThreadLocal