--- jsr166/src/jsr166y/LinkedTransferQueue.java 2009/10/21 16:30:40 1.45 +++ jsr166/src/jsr166y/LinkedTransferQueue.java 2009/10/24 14:33:29 1.51 @@ -105,29 +105,31 @@ public class LinkedTransferQueue exte * successful atomic operation per enq/deq pair. But it also * enables lower cost variants of queue maintenance mechanics. (A * variation of this idea applies even for non-dual queues that - * support deletion of embedded elements, such as + * support deletion of interior elements, such as * j.u.c.ConcurrentLinkedQueue.) * - * Once a node is matched, its item can never again change. We - * may thus arrange that the linked list of them contains a prefix - * of zero or more matched nodes, followed by a suffix of zero or - * more unmatched nodes. (Note that we allow both the prefix and - * suffix to be zero length, which in turn means that we do not - * use a dummy header.) If we were not concerned with either time - * or space efficiency, we could correctly perform enqueue and - * dequeue operations by traversing from a pointer to the initial - * node; CASing the item of the first unmatched node on match and - * CASing the next field of the trailing node on appends. While - * this would be a terrible idea in itself, it does have the - * benefit of not requiring ANY atomic updates on head/tail - * fields. + * Once a node is matched, its match status can never again + * change. We may thus arrange that the linked list of them + * contain a prefix of zero or more matched nodes, followed by a + * suffix of zero or more unmatched nodes. (Note that we allow + * both the prefix and suffix to be zero length, which in turn + * means that we do not use a dummy header.) If we were not + * concerned with either time or space efficiency, we could + * correctly perform enqueue and dequeue operations by traversing + * from a pointer to the initial node; CASing the item of the + * first unmatched node on match and CASing the next field of the + * trailing node on appends. 
(Plus some special-casing when + * initially empty). While this would be a terrible idea in + * itself, it does have the benefit of not requiring ANY atomic + * updates on head/tail fields. * * We introduce here an approach that lies between the extremes of - * never versus always updating queue (head and tail) pointers - * that reflects the tradeoff of sometimes require extra traversal - * steps to locate the first and/or last unmatched nodes, versus - * the reduced overhead and contention of fewer updates to queue - * pointers. For example, a possible snapshot of a queue is: + * never versus always updating queue (head and tail) pointers. + * This offers a tradeoff between sometimes requiring extra + * traversal steps to locate the first and/or last unmatched + * nodes, versus the reduced overhead and contention of fewer + * updates to queue pointers. For example, a possible snapshot of + * a queue is: * * head tail * | | @@ -139,7 +141,8 @@ public class LinkedTransferQueue exte * similarly for "tail") is an empirical matter. We have found * that using very small constants in the range of 1-3 work best * over a range of platforms. Larger values introduce increasing - * costs of cache misses and risks of long traversal chains. + * costs of cache misses and risks of long traversal chains, while + * smaller values increase CAS contention and overhead. * * Dual queues with slack differ from plain M&S dual queues by * virtue of only sometimes updating head or tail pointers when @@ -158,17 +161,17 @@ public class LinkedTransferQueue exte * targets. Even when using very small slack values, this * approach works well for dual queues because it allows all * operations up to the point of matching or appending an item - * (hence potentially releasing another thread) to be read-only, - * thus not introducing any further contention. As described - * below, we implement this by performing slack maintenance - * retries only after these points. 
+ * (hence potentially allowing progress by another thread) to be + * read-only, thus not introducing any further contention. As + * described below, we implement this by performing slack + * maintenance retries only after these points. * * As an accompaniment to such techniques, traversal overhead can * be further reduced without increasing contention of head - * pointer updates. During traversals, threads may sometimes - * shortcut the "next" link path from the current "head" node to - * be closer to the currently known first unmatched node. Again, - * this may be triggered with using thresholds or randomization. + * pointer updates: Threads may sometimes shortcut the "next" link + * path from the current "head" node to be closer to the currently + * known first unmatched node, and similarly for tail. Again, this + * may be triggered with using thresholds or randomization. * * These ideas must be further extended to avoid unbounded amounts * of costly-to-reclaim garbage caused by the sequential "next" @@ -180,7 +183,7 @@ public class LinkedTransferQueue exte * (Similar issues arise in non-GC environments.) To cope with * this in our implementation, upon CASing to advance the head * pointer, we set the "next" link of the previous head to point - * only to itself; thus limiting the length connected dead lists. + * only to itself; thus limiting the length of connected dead lists. * (We also take similar care to wipe out possibly garbage * retaining values held in other Node fields.) However, doing so * adds some further complexity to traversal: If any "next" @@ -196,47 +199,71 @@ public class LinkedTransferQueue exte * mechanics because an update may leave head at a detached node. 
* And while direct writes are possible for tail updates, they * increase the risk of long retraversals, and hence long garbage - * chains which can be much more costly than is worthwhile + * chains, which can be much more costly than is worthwhile * considering that the cost difference of performing a CAS vs * write is smaller when they are not triggered on each operation * (especially considering that writes and CASes equally require * additional GC bookkeeping ("write barriers") that are sometimes * more costly than the writes themselves because of contention). * - * Removal of internal nodes (due to timed out or interrupted - * waits, or calls to remove or Iterator.remove) uses a scheme - * roughly similar to that in Scherer, Lea, and Scott - * SynchronousQueue. Given a predecessor, we can unsplice any node - * except the (actual) tail of the queue. To avoid build-up of - * cancelled trailing nodes, upon a request to remove a trailing - * node, it is placed in field "cleanMe" to be unspliced later. + * Removal of interior nodes (due to timed out or interrupted + * waits, or calls to remove(x) or Iterator.remove) can use a + * scheme roughly similar to that described in Scherer, Lea, and + * Scott's SynchronousQueue. Given a predecessor, we can unsplice + * any node except the (actual) tail of the queue. To avoid + * build-up of cancelled trailing nodes, upon a request to remove + * a trailing node, it is placed in field "cleanMe" to be + * unspliced upon the next call to unsplice any other node. + * Situations needing such mechanics are not common but do occur + * in practice; for example when an unbounded series of short + * timed calls to poll repeatedly time out but never otherwise + * fall off the list because of an untimed call to take at the + * front of the queue. 
Note that maintaining field cleanMe does + * not otherwise much impact garbage retention even if never + * cleared by some other call because the held node will + * eventually either directly or indirectly lead to a self-link + * once off the list. * * *** Overview of implementation *** * - * We use a threshold-based approach to updates, with a target - * slack of two. The slack value is hard-wired: a path greater + * We use a threshold-based approach to updates, with a slack + * threshold of two -- that is, we update head/tail when the + * current pointer appears to be two or more steps away from the + * first/last node. The slack value is hard-wired: a path greater * than one is naturally implemented by checking equality of * traversal pointers except when the list has only one element, - * in which case we keep max slack at one. Avoiding tracking - * explicit counts across situations slightly simplifies an + * in which case we keep slack threshold at one. Avoiding tracking + * explicit counts across method calls slightly simplifies an * already-messy implementation. Using randomization would * probably work better if there were a low-quality dirt-cheap * per-thread one available, but even ThreadLocalRandom is too * heavy for these purposes. * - * With such a small slack value, path short-circuiting is rarely - * worthwhile. However, it is used (in awaitMatch) immediately - * before a waiting thread starts to block, as a final bit of - * helping at a point when contention with others is extremely - * unlikely (since if other threads that could release it are - * operating, then the current thread wouldn't be blocking). + * With such a small slack threshold value, it is rarely + * worthwhile to augment this with path short-circuiting; i.e., + * unsplicing nodes between head and the first unmatched node, or + * similarly for tail, rather than advancing head or tail + * proper. 
However, it is used (in awaitMatch) immediately before + * a waiting thread starts to block, as a final bit of helping at + * a point when contention with others is extremely unlikely + * (since if other threads that could release it are operating, + * then the current thread wouldn't be blocking). + * + * We allow both the head and tail fields to be null before any + * nodes are enqueued; initializing upon first append. This + * simplifies some other logic, as well as providing more + * efficient explicit control paths instead of letting JVMs insert + * implicit NullPointerExceptions when they are null. While not + * currently fully implemented, we also leave open the possibility + * of re-nulling these fields when empty (which is complicated to + * arrange, for little benefit.) * * All enqueue/dequeue operations are handled by the single method * "xfer" with parameters indicating whether to act as some form * of offer, put, poll, take, or transfer (each possibly with * timeout). The relative complexity of using one monolithic * method outweighs the code bulk and maintenance problems of - * using nine separate methods. + * using separate methods for each case. * * Operation consists of up to three phases. The first is * implemented within method xfer, the second in tryAppend, and @@ -249,33 +276,36 @@ public class LinkedTransferQueue exte * case matching it and returning, also if necessary updating * head to one past the matched node (or the node itself if the * list has no other unmatched nodes). If the CAS misses, then - * a retry loops until the slack is at most two. Traversals - * also check if the initial head is now off-list, in which - * case they start at the new head. + * a loop retries advancing head by two steps until either + * success or the slack is at most two. By requiring that each + * attempt advances head by two (if applicable), we ensure that + * the slack does not grow without bound. 
Traversals also check + * if the initial head is now off-list, in which case they + * start at the new head. * * If no candidates are found and the call was untimed * poll/offer, (argument "how" is NOW) return. * * 2. Try to append a new node (method tryAppend) * - * Starting at current tail pointer, try to append a new node - * to the list (or if head was null, establish the first - * node). Nodes can be appended only if their predecessors are - * either already matched or are of the same mode. If we detect - * otherwise, then a new node with opposite mode must have been - * appended during traversal, so must restart at phase 1. The - * traversal and update steps are otherwise similar to phase 1: - * Retrying upon CAS misses and checking for staleness. In - * particular, if a self-link is encountered, then we can - * safely jump to a node on the list by continuing the - * traversal at current head. + * Starting at current tail pointer, find the actual last node + * and try to append a new node (or if head was null, establish + * the first node). Nodes can be appended only if their + * predecessors are either already matched or are of the same + * mode. If we detect otherwise, then a new node with opposite + * mode must have been appended during traversal, so we must + * restart at phase 1. The traversal and update steps are + * otherwise similar to phase 1: Retrying upon CAS misses and + * checking for staleness. In particular, if a self-link is + * encountered, then we can safely jump to a node on the list + * by continuing the traversal at current head. * - * On successful append, if the call was ASYNC, return + * On successful append, if the call was ASYNC, return. * * 3. Await match or cancellation (method awaitMatch) * * Wait for another thread to match node; instead cancelling if - * current thread was interrupted or the wait timed out. On + * the current thread was interrupted or the wait timed out. 
On * multiprocessors, we use front-of-queue spinning: If a node * appears to be the first unmatched node in the queue, it * spins a bit before blocking. In either case, before blocking @@ -290,15 +320,15 @@ public class LinkedTransferQueue exte * to decide to occasionally perform a Thread.yield. While * yield has underdefined specs, we assume that it might help, * and will not hurt in limiting impact of spinning on busy - * systems. We also use much smaller (1/4) spins for nodes - * that are not known to be front but whose predecessors have - * not blocked -- these "chained" spins avoid artifacts of + * systems. We also use smaller (1/2) spins for nodes that are + * not known to be front but whose predecessors have not + * blocked -- these "chained" spins avoid artifacts of * front-of-queue rules which otherwise lead to alternating * nodes spinning vs blocking. Further, front threads that * represent phase changes (from data to request node or vice * versa) compared to their predecessors receive additional - * spins, reflecting the longer code path lengths necessary to - * release them under contention. + * chained spins, reflecting longer paths typically required to + * unblock threads during phase changes. */ /** True if on multiprocessor */ @@ -306,31 +336,34 @@ public class LinkedTransferQueue exte Runtime.getRuntime().availableProcessors() > 1; /** - * The number of times to spin (with on average one randomly - * interspersed call to Thread.yield) on multiprocessor before - * blocking when a node is apparently the first waiter in the - * queue. See above for explanation. Must be a power of two. The - * value is empirically derived -- it works pretty well across a - * variety of processors, numbers of CPUs, and OSes. + * The number of times to spin (with randomly interspersed calls + * to Thread.yield) on multiprocessor before blocking when a node + * is apparently the first waiter in the queue. See above for + * explanation. Must be a power of two. 
The value is empirically + * derived -- it works pretty well across a variety of processors, + * numbers of CPUs, and OSes. */ private static final int FRONT_SPINS = 1 << 7; /** * The number of times to spin before blocking when a node is - * preceded by another node that is apparently spinning. + * preceded by another node that is apparently spinning. Also + * serves as an increment to FRONT_SPINS on phase changes, and as + * base average frequency for yielding during spins. Must be a + * power of two. */ - private static final int CHAINED_SPINS = FRONT_SPINS >>> 2; + private static final int CHAINED_SPINS = FRONT_SPINS >>> 1; /** - * Queue nodes. Uses Object, not E for items to allow forgetting + * Queue nodes. Uses Object, not E, for items to allow forgetting * them after use. Relies heavily on Unsafe mechanics to minimize - * unecessary ordering constraints: Writes that intrinsically + * unnecessary ordering constraints: Writes that intrinsically * precede or follow CASes use simple relaxed forms. Other * cleanups use releasing/lazy writes. */ static final class Node { final boolean isData; // false if this is a request node - volatile Object item; // initially nonnull if isData; CASed to match + volatile Object item; // initially non-null if isData; CASed to match volatile Node next; volatile Thread waiter; // null until waiting @@ -344,8 +377,8 @@ public class LinkedTransferQueue exte } /** - * Create a new node. Uses relaxed write because item can only - * be seen if followed by CAS + * Creates a new node. Uses relaxed write because item can only + * be seen if followed by CAS. */ Node(Object item, boolean isData) { UNSAFE.putObject(this, itemOffset, item); // relaxed write @@ -391,7 +424,7 @@ public class LinkedTransferQueue exte } /** - * Tries to artifically match a data node -- used by remove. + * Tries to artificially match a data node -- used by remove. 
*/ final boolean tryMatchData() { Object x = item; @@ -449,10 +482,10 @@ public class LinkedTransferQueue exte * Implements all queuing methods. See above for explanation. * * @param e the item or null for take - * @param haveData true if this is a put else a take + * @param haveData true if this is a put, else a take * @param how NOW, ASYNC, SYNC, or TIMEOUT * @param nanos timeout in nanosecs, used only if mode is TIMEOUT - * @return an item if matched, else e; + * @return an item if matched, else e * @throws NullPointerException if haveData mode but e is null */ private Object xfer(Object e, boolean haveData, int how, long nanos) { @@ -469,7 +502,7 @@ public class LinkedTransferQueue exte if (isData == haveData) // can't match break; if (p.casItem(item, e)) { // match - Thread w = p.waiter; + LockSupport.unpark(p.waiter); while (p != h) { // update head Node n = p.next; // by 2 unless singleton if (n != null) @@ -482,12 +515,11 @@ public class LinkedTransferQueue exte (p = h.next) == null || !p.isMatched()) break; // unless slack < 2 } - LockSupport.unpark(w); return item; } } Node n = p.next; - p = p != n ? n : (h = head); // Use head if p offlist + p = (p != n) ? n : (h = head); // Use head if p offlist } if (how >= ASYNC) { // No matches available @@ -497,22 +529,23 @@ public class LinkedTransferQueue exte if (pred == null) continue retry; // lost race vs opposite mode if (how >= SYNC) - return awaitMatch(pred, s, e, how, nanos); + return awaitMatch(s, pred, e, how, nanos); } return e; // not waiting } } /** - * Tries to append node s as tail - * @param haveData true if appending in data mode + * Tries to append node s as tail. 
+ * * @param s the node to append + * @param haveData true if appending in data mode * @return null on failure due to losing race with append in * different mode, else s's predecessor, or s itself if no * predecessor */ private Node tryAppend(Node s, boolean haveData) { - for (Node t = tail, p = t;;) { // move p to actual tail and append + for (Node t = tail, p = t;;) { // move p to last node and append Node n, u; // temps for reads of next & tail if (p == null && (p = head) == null) { if (casHead(null, s)) @@ -520,13 +553,13 @@ public class LinkedTransferQueue exte } else if (p.cannotPrecede(haveData)) return null; // lost race vs opposite mode - else if ((n = p.next) != null) // Not tail; keep traversing + else if ((n = p.next) != null) // not last; keep traversing p = p != t && t != (u = tail) ? (t = u) : // stale tail - p != n ? n : null; // restart if off list + (p != n) ? n : null; // restart if off list else if (!p.casNext(null, s)) p = p.next; // re-read on CAS failure else { - if (p != t) { // Update if slack now >= 2 + if (p != t) { // update if slack now >= 2 while ((tail != t || !casTail(t, s)) && (t = tail) != null && (s = t.next) != null && // advance and retry @@ -540,14 +573,16 @@ public class LinkedTransferQueue exte /** * Spins/yields/blocks until node s is matched or caller gives up. * - * @param pred the predecessor of s or s or null if none * @param s the waiting node + * @param pred the predecessor of s, or s itself if it has no + * predecessor, or null if unknown (the null case does not occur + * in any current calls but may in possible future extensions) * @param e the comparison value for checking match * @param how either SYNC or TIMEOUT * @param nanos timeout value * @return matched item, or e if unmatched on interrupt or timeout */ - private Object awaitMatch(Node pred, Node s, Object e, + private Object awaitMatch(Node s, Node pred, Object e, int how, long nanos) { long lastTime = (how == TIMEOUT) ? 
System.nanoTime() : 0L; Thread w = Thread.currentThread(); @@ -570,14 +605,14 @@ public class LinkedTransferQueue exte if ((spins = spinsFor(pred, s.isData)) > 0) randomYields = ThreadLocalRandom.current(); } - else if (spins > 0) { // spin, occasionally yield - if (randomYields.nextInt(FRONT_SPINS) == 0) - Thread.yield(); - --spins; + else if (spins > 0) { // spin + if (--spins == 0) + shortenHeadPath(); // reduce slack before blocking + else if (randomYields.nextInt(CHAINED_SPINS) == 0) + Thread.yield(); // occasionally yield } else if (s.waiter == null) { - shortenHeadPath(); // reduce slack before blocking - s.waiter = w; // request unpark + s.waiter = w; // request unpark then recheck } else if (how == TIMEOUT) { long now = System.nanoTime(); @@ -587,21 +622,21 @@ public class LinkedTransferQueue exte } else { LockSupport.park(this); + s.waiter = null; spins = -1; // spin if front upon wakeup } } } /** - * Return spin/yield value for a node with given predecessor and + * Returns spin/yield value for a node with given predecessor and * data mode. See above for explanation. */ private static int spinsFor(Node pred, boolean haveData) { if (MP && pred != null) { - boolean predData = pred.isData; - if (predData != haveData) // front and phase change - return FRONT_SPINS + (FRONT_SPINS >>> 1); - if (predData != (pred.item != null)) // probably at front + if (pred.isData != haveData) // phase change + return FRONT_SPINS + CHAINED_SPINS; + if (pred.isMatched()) // probably at front return FRONT_SPINS; if (pred.waiter == null) // pred apparently spinning return CHAINED_SPINS; @@ -633,15 +668,15 @@ public class LinkedTransferQueue exte /* -------------- Traversal methods -------------- */ /** - * Return the first unmatched node of the given mode, or null if + * Returns the first unmatched node of the given mode, or null if * none. Used by methods isEmpty, hasWaitingConsumer. 
*/ private Node firstOfMode(boolean data) { for (Node p = head; p != null; ) { if (!p.isMatched()) - return p.isData == data? p : null; + return (p.isData == data) ? p : null; Node n = p.next; - p = n != p ? n : head; + p = (n != p) ? n : head; } return null; } @@ -657,14 +692,14 @@ public class LinkedTransferQueue exte if (item != p && (item != null) == isData) return isData ? item : null; Node n = p.next; - p = n != p ? n : head; + p = (n != p) ? n : head; } return null; } /** - * Traverse and count nodes of the given mode. - * Used by methds size and getWaitingConsumerCount. + * Traverses and counts unmatched nodes of the given mode. + * Used by methods size and getWaitingConsumerCount. */ private int countOfMode(boolean data) { int count = 0; @@ -711,7 +746,7 @@ public class LinkedTransferQueue exte else if (item == null) break; Node n = p.next; - p = n != p ? n : head; + p = (n != p) ? n : head; } nextNode = null; } @@ -753,8 +788,8 @@ public class LinkedTransferQueue exte s.forgetContents(); // clear unneeded fields /* * At any given time, exactly one node on list cannot be - * deleted -- the last inserted node. To accommodate this, if - * we cannot delete s, we save its predecessor as "cleanMe", + * unlinked -- the last inserted node. To accommodate this, if + * we cannot unlink s, we save its predecessor as "cleanMe", * processing the previously saved version first. Because only * one node in the list can have a null next, at least one of * node s or the node previously saved can always be @@ -762,7 +797,7 @@ public class LinkedTransferQueue exte */ if (pred != null && pred != s) { while (pred.next == s) { - Node oldpred = cleanMe == null? null : reclean(); + Node oldpred = (cleanMe == null) ? null : reclean(); Node n = s.next; if (n != null) { if (n != s) @@ -1124,7 +1159,7 @@ public class LinkedTransferQueue exte } /** - * Save the state to a stream (that is, serialize it). + * Saves the state to a stream (that is, serializes it). 
* * @serialData All of the elements (each an {@code E}) in * the proper order, followed by a null @@ -1140,8 +1175,8 @@ public class LinkedTransferQueue exte } /** - * Reconstitute the Queue instance from a stream (that is, - * deserialize it). + * Reconstitutes the Queue instance from a stream (that is, + * deserializes it). * * @param s the stream */