--- jsr166/src/jsr166y/LinkedTransferQueue.java 2009/10/22 09:06:38 1.47 +++ jsr166/src/jsr166y/LinkedTransferQueue.java 2009/10/24 14:57:32 1.52 @@ -105,29 +105,31 @@ public class LinkedTransferQueue exte * successful atomic operation per enq/deq pair. But it also * enables lower cost variants of queue maintenance mechanics. (A * variation of this idea applies even for non-dual queues that - * support deletion of embedded elements, such as + * support deletion of interior elements, such as * j.u.c.ConcurrentLinkedQueue.) * - * Once a node is matched, its item can never again change. We - * may thus arrange that the linked list of them contains a prefix - * of zero or more matched nodes, followed by a suffix of zero or - * more unmatched nodes. (Note that we allow both the prefix and - * suffix to be zero length, which in turn means that we do not - * use a dummy header.) If we were not concerned with either time - * or space efficiency, we could correctly perform enqueue and - * dequeue operations by traversing from a pointer to the initial - * node; CASing the item of the first unmatched node on match and - * CASing the next field of the trailing node on appends. While - * this would be a terrible idea in itself, it does have the - * benefit of not requiring ANY atomic updates on head/tail - * fields. + * Once a node is matched, its match status can never again + * change. We may thus arrange that the linked list of them + * contain a prefix of zero or more matched nodes, followed by a + * suffix of zero or more unmatched nodes. (Note that we allow + * both the prefix and suffix to be zero length, which in turn + * means that we do not use a dummy header.) If we were not + * concerned with either time or space efficiency, we could + * correctly perform enqueue and dequeue operations by traversing + * from a pointer to the initial node; CASing the item of the + * first unmatched node on match and CASing the next field of the + * trailing node on appends. 
(Plus some special-casing when + * initially empty). While this would be a terrible idea in + * itself, it does have the benefit of not requiring ANY atomic + * updates on head/tail fields. * * We introduce here an approach that lies between the extremes of - * never versus always updating queue (head and tail) pointers - * that reflects the tradeoff of sometimes requiring extra traversal - * steps to locate the first and/or last unmatched nodes, versus - * the reduced overhead and contention of fewer updates to queue - * pointers. For example, a possible snapshot of a queue is: + * never versus always updating queue (head and tail) pointers. + * This offers a tradeoff between sometimes requiring extra + * traversal steps to locate the first and/or last unmatched + * nodes, versus the reduced overhead and contention of fewer + * updates to queue pointers. For example, a possible snapshot of + * a queue is: * * head tail * | | @@ -139,7 +141,8 @@ public class LinkedTransferQueue exte * similarly for "tail") is an empirical matter. We have found * that using very small constants in the range of 1-3 work best * over a range of platforms. Larger values introduce increasing - * costs of cache misses and risks of long traversal chains. + * costs of cache misses and risks of long traversal chains, while + * smaller values increase CAS contention and overhead. * * Dual queues with slack differ from plain M&S dual queues by * virtue of only sometimes updating head or tail pointers when @@ -158,17 +161,17 @@ public class LinkedTransferQueue exte * targets. Even when using very small slack values, this * approach works well for dual queues because it allows all * operations up to the point of matching or appending an item - * (hence potentially releasing another thread) to be read-only, - * thus not introducing any further contention. As described - * below, we implement this by performing slack maintenance - * retries only after these points. 
+ * (hence potentially allowing progress by another thread) to be + * read-only, thus not introducing any further contention. As + * described below, we implement this by performing slack + * maintenance retries only after these points. * * As an accompaniment to such techniques, traversal overhead can * be further reduced without increasing contention of head - * pointer updates. During traversals, threads may sometimes - * shortcut the "next" link path from the current "head" node to - * be closer to the currently known first unmatched node. Again, - * this may be triggered with using thresholds or randomization. + * pointer updates: Threads may sometimes shortcut the "next" link + * path from the current "head" node to be closer to the currently + * known first unmatched node, and similarly for tail. Again, this + * may be triggered using thresholds or randomization. * * These ideas must be further extended to avoid unbounded amounts * of costly-to-reclaim garbage caused by the sequential "next" @@ -196,47 +199,71 @@ public class LinkedTransferQueue exte * mechanics because an update may leave head at a detached node. * And while direct writes are possible for tail updates, they * increase the risk of long retraversals, and hence long garbage - * chains which can be much more costly than is worthwhile + * chains, which can be much more costly than is worthwhile * considering that the cost difference of performing a CAS vs * write is smaller when they are not triggered on each operation * (especially considering that writes and CASes equally require * additional GC bookkeeping ("write barriers") that are sometimes * more costly than the writes themselves because of contention). * - * Removal of internal nodes (due to timed out or interrupted - * waits, or calls to remove or Iterator.remove) uses a scheme - * roughly similar to that in Scherer, Lea, and Scott - * SynchronousQueue. 
Given a predecessor, we can unsplice any node - * except the (actual) tail of the queue. To avoid build-up of - * cancelled trailing nodes, upon a request to remove a trailing - * node, it is placed in field "cleanMe" to be unspliced later. + * Removal of interior nodes (due to timed out or interrupted + * waits, or calls to remove(x) or Iterator.remove) can use a + * scheme roughly similar to that described in Scherer, Lea, and + * Scott's SynchronousQueue. Given a predecessor, we can unsplice + * any node except the (actual) tail of the queue. To avoid + * build-up of cancelled trailing nodes, upon a request to remove + * a trailing node, it is placed in field "cleanMe" to be + * unspliced upon the next call to unsplice any other node. + * Situations needing such mechanics are not common but do occur + * in practice; for example when an unbounded series of short + * timed calls to poll repeatedly time out but never otherwise + * fall off the list because of an untimed call to take at the + * front of the queue. Note that maintaining field cleanMe does + * not otherwise much impact garbage retention even if never + * cleared by some other call because the held node will + * eventually either directly or indirectly lead to a self-link + * once off the list. * * *** Overview of implementation *** * - * We use a threshold-based approach to updates, with a target - * slack of two. The slack value is hard-wired: a path greater + * We use a threshold-based approach to updates, with a slack + * threshold of two -- that is, we update head/tail when the + * current pointer appears to be two or more steps away from the + * first/last node. The slack value is hard-wired: a path greater * than one is naturally implemented by checking equality of * traversal pointers except when the list has only one element, - * in which case we keep max slack at one. 
Avoiding tracking - * explicit counts across situations slightly simplifies an + * in which case we keep slack threshold at one. Avoiding tracking + * explicit counts across method calls slightly simplifies an * already-messy implementation. Using randomization would * probably work better if there were a low-quality dirt-cheap * per-thread one available, but even ThreadLocalRandom is too * heavy for these purposes. * - * With such a small slack value, path short-circuiting is rarely - * worthwhile. However, it is used (in awaitMatch) immediately - * before a waiting thread starts to block, as a final bit of - * helping at a point when contention with others is extremely - * unlikely (since if other threads that could release it are - * operating, then the current thread wouldn't be blocking). + * With such a small slack threshold value, it is rarely + * worthwhile to augment this with path short-circuiting; i.e., + * unsplicing nodes between head and the first unmatched node, or + * similarly for tail, rather than advancing head or tail + * proper. However, it is used (in awaitMatch) immediately before + * a waiting thread starts to block, as a final bit of helping at + * a point when contention with others is extremely unlikely + * (since if other threads that could release it are operating, + * then the current thread wouldn't be blocking). + * + * We allow both the head and tail fields to be null before any + * nodes are enqueued; initializing upon first append. This + * simplifies some other logic, as well as providing more + * efficient explicit control paths instead of letting JVMs insert + * implicit NullPointerExceptions when they are null. While not + * currently fully implemented, we also leave open the possibility + * of re-nulling these fields when empty (which is complicated to + * arrange, for little benefit.) 
* * All enqueue/dequeue operations are handled by the single method * "xfer" with parameters indicating whether to act as some form * of offer, put, poll, take, or transfer (each possibly with * timeout). The relative complexity of using one monolithic * method outweighs the code bulk and maintenance problems of - * using nine separate methods. + * using separate methods for each case. * * Operation consists of up to three phases. The first is * implemented within method xfer, the second in tryAppend, and @@ -249,33 +276,36 @@ public class LinkedTransferQueue exte * case matching it and returning, also if necessary updating * head to one past the matched node (or the node itself if the * list has no other unmatched nodes). If the CAS misses, then - * a retry loops until the slack is at most two. Traversals - * also check if the initial head is now off-list, in which - * case they start at the new head. + * a loop retries advancing head by two steps until either + * success or the slack is at most two. By requiring that each + * attempt advances head by two (if applicable), we ensure that + * the slack does not grow without bound. Traversals also check + * if the initial head is now off-list, in which case they + * start at the new head. * * If no candidates are found and the call was untimed * poll/offer, (argument "how" is NOW) return. * * 2. Try to append a new node (method tryAppend) * - * Starting at current tail pointer, try to append a new node - * to the list (or if head was null, establish the first - * node). Nodes can be appended only if their predecessors are - * either already matched or are of the same mode. If we detect - * otherwise, then a new node with opposite mode must have been - * appended during traversal, so must restart at phase 1. The - * traversal and update steps are otherwise similar to phase 1: - * Retrying upon CAS misses and checking for staleness. 
In - * particular, if a self-link is encountered, then we can - * safely jump to a node on the list by continuing the - * traversal at current head. + * Starting at current tail pointer, find the actual last node + * and try to append a new node (or if head was null, establish + * the first node). Nodes can be appended only if their + * predecessors are either already matched or are of the same + * mode. If we detect otherwise, then a new node with opposite + * mode must have been appended during traversal, so we must + * restart at phase 1. The traversal and update steps are + * otherwise similar to phase 1: Retrying upon CAS misses and + * checking for staleness. In particular, if a self-link is + * encountered, then we can safely jump to a node on the list + * by continuing the traversal at current head. * * On successful append, if the call was ASYNC, return. * * 3. Await match or cancellation (method awaitMatch) * * Wait for another thread to match node; instead cancelling if - * current thread was interrupted or the wait timed out. On + * the current thread was interrupted or the wait timed out. On * multiprocessors, we use front-of-queue spinning: If a node * appears to be the first unmatched node in the queue, it * spins a bit before blocking. In either case, before blocking @@ -290,15 +320,15 @@ public class LinkedTransferQueue exte * to decide to occasionally perform a Thread.yield. While * yield has underdefined specs, we assume that might it help, * and will not hurt in limiting impact of spinning on busy - * systems. We also use much smaller (1/4) spins for nodes - * that are not known to be front but whose predecessors have - * not blocked -- these "chained" spins avoid artifacts of + * systems. We also use smaller (1/2) spins for nodes that are + * not known to be front but whose predecessors have not + * blocked -- these "chained" spins avoid artifacts of * front-of-queue rules which otherwise lead to alternating * nodes spinning vs blocking. 
Further, front threads that * represent phase changes (from data to request node or vice * versa) compared to their predecessors receive additional - * spins, reflecting the longer code path lengths necessary to - * release them under contention. + * chained spins, reflecting longer paths typically required to + * unblock threads during phase changes. */ /** True if on multiprocessor */ @@ -306,20 +336,23 @@ public class LinkedTransferQueue exte Runtime.getRuntime().availableProcessors() > 1; /** - * The number of times to spin (with on average one randomly - * interspersed call to Thread.yield) on multiprocessor before - * blocking when a node is apparently the first waiter in the - * queue. See above for explanation. Must be a power of two. The - * value is empirically derived -- it works pretty well across a - * variety of processors, numbers of CPUs, and OSes. + * The number of times to spin (with randomly interspersed calls + * to Thread.yield) on multiprocessor before blocking when a node + * is apparently the first waiter in the queue. See above for + * explanation. Must be a power of two. The value is empirically + * derived -- it works pretty well across a variety of processors, + * numbers of CPUs, and OSes. */ private static final int FRONT_SPINS = 1 << 7; /** * The number of times to spin before blocking when a node is - * preceded by another node that is apparently spinning. + * preceded by another node that is apparently spinning. Also + * serves as an increment to FRONT_SPINS on phase changes, and as + * base average frequency for yielding during spins. Must be a + * power of two. */ - private static final int CHAINED_SPINS = FRONT_SPINS >>> 2; + private static final int CHAINED_SPINS = FRONT_SPINS >>> 1; /** * Queue nodes. 
Uses Object, not E, for items to allow forgetting @@ -469,20 +502,19 @@ public class LinkedTransferQueue exte if (isData == haveData) // can't match break; if (p.casItem(item, e)) { // match - Thread w = p.waiter; - while (p != h) { // update head - Node n = p.next; // by 2 unless singleton - if (n != null) - p = n; - if (head == h && casHead(h, p)) { + for (Node q = p; q != h;) { + Node n = q.next; // update head by 2 + if (n != null) // unless singleton + q = n; + if (head == h && casHead(h, q)) { h.forgetNext(); break; } // advance and retry if ((h = head) == null || - (p = h.next) == null || !p.isMatched()) + (q = h.next) == null || !q.isMatched()) break; // unless slack < 2 } - LockSupport.unpark(w); + LockSupport.unpark(p.waiter); return item; } } @@ -497,7 +529,7 @@ public class LinkedTransferQueue exte if (pred == null) continue retry; // lost race vs opposite mode if (how >= SYNC) - return awaitMatch(pred, s, e, how, nanos); + return awaitMatch(s, pred, e, how, nanos); } return e; // not waiting } @@ -506,14 +538,14 @@ public class LinkedTransferQueue exte /** * Tries to append node s as tail. * - * @param haveData true if appending in data mode * @param s the node to append + * @param haveData true if appending in data mode * @return null on failure due to losing race with append in * different mode, else s's predecessor, or s itself if no * predecessor */ private Node tryAppend(Node s, boolean haveData) { - for (Node t = tail, p = t;;) { // move p to actual tail and append + for (Node t = tail, p = t;;) { // move p to last node and append Node n, u; // temps for reads of next & tail if (p == null && (p = head) == null) { if (casHead(null, s)) @@ -521,13 +553,13 @@ public class LinkedTransferQueue exte } else if (p.cannotPrecede(haveData)) return null; // lost race vs opposite mode - else if ((n = p.next) != null) // Not tail; keep traversing + else if ((n = p.next) != null) // not last; keep traversing p = p != t && t != (u = tail) ? 
(t = u) : // stale tail (p != n) ? n : null; // restart if off list else if (!p.casNext(null, s)) p = p.next; // re-read on CAS failure else { - if (p != t) { // Update if slack now >= 2 + if (p != t) { // update if slack now >= 2 while ((tail != t || !casTail(t, s)) && (t = tail) != null && (s = t.next) != null && // advance and retry @@ -541,14 +573,16 @@ public class LinkedTransferQueue exte /** * Spins/yields/blocks until node s is matched or caller gives up. * - * @param pred the predecessor of s or s or null if none * @param s the waiting node + * @param pred the predecessor of s, or s itself if it has no + * predecessor, or null if unknown (the null case does not occur + * in any current calls but may in possible future extensions) * @param e the comparison value for checking match * @param how either SYNC or TIMEOUT * @param nanos timeout value * @return matched item, or e if unmatched on interrupt or timeout */ - private Object awaitMatch(Node pred, Node s, Object e, + private Object awaitMatch(Node s, Node pred, Object e, int how, long nanos) { long lastTime = (how == TIMEOUT) ? 
System.nanoTime() : 0L; Thread w = Thread.currentThread(); @@ -571,14 +605,14 @@ public class LinkedTransferQueue exte if ((spins = spinsFor(pred, s.isData)) > 0) randomYields = ThreadLocalRandom.current(); } - else if (spins > 0) { // spin, occasionally yield - if (randomYields.nextInt(FRONT_SPINS) == 0) - Thread.yield(); - --spins; + else if (spins > 0) { // spin + if (--spins == 0) + shortenHeadPath(); // reduce slack before blocking + else if (randomYields.nextInt(CHAINED_SPINS) == 0) + Thread.yield(); // occasionally yield } else if (s.waiter == null) { - shortenHeadPath(); // reduce slack before blocking - s.waiter = w; // request unpark + s.waiter = w; // request unpark then recheck } else if (how == TIMEOUT) { long now = System.nanoTime(); @@ -588,6 +622,7 @@ public class LinkedTransferQueue exte } else { LockSupport.park(this); + s.waiter = null; spins = -1; // spin if front upon wakeup } } @@ -599,10 +634,9 @@ public class LinkedTransferQueue exte */ private static int spinsFor(Node pred, boolean haveData) { if (MP && pred != null) { - boolean predData = pred.isData; - if (predData != haveData) // front and phase change - return FRONT_SPINS + (FRONT_SPINS >>> 1); - if (predData != (pred.item != null)) // probably at front + if (pred.isData != haveData) // phase change + return FRONT_SPINS + CHAINED_SPINS; + if (pred.isMatched()) // probably at front return FRONT_SPINS; if (pred.waiter == null) // pred apparently spinning return CHAINED_SPINS; @@ -754,8 +788,8 @@ public class LinkedTransferQueue exte s.forgetContents(); // clear unneeded fields /* * At any given time, exactly one node on list cannot be - * deleted -- the last inserted node. To accommodate this, if - * we cannot delete s, we save its predecessor as "cleanMe", + * unlinked -- the last inserted node. To accommodate this, if + * we cannot unlink s, we save its predecessor as "cleanMe", * processing the previously saved version first. 
Because only * one node in the list can have a null next, at least one of * node s or the node previously saved can always be