6 |
|
*/ |
7 |
|
|
8 |
|
package java.util.concurrent; |
9 |
– |
import java.util.concurrent.locks.*; |
9 |
|
import java.util.concurrent.atomic.*; |
11 |
– |
import java.util.Random; |
10 |
|
|
11 |
|
/** |
12 |
|
* A synchronization point at which threads can pair and swap elements |
13 |
< |
* within pairs. Each thread presents some object on entry to the |
13 |
> |
* within pairs. Each thread presents some object on entry to the |
14 |
|
* {@link #exchange exchange} method, matches with a partner thread, |
15 |
< |
* and receives its partner's object on return. |
15 |
> |
* and receives its partner's object on return. An Exchanger may be |
16 |
> |
* viewed as a bidirectional form of a {@link |
17 |
> |
* SynchronousQueue}. Exchangers may be useful in applications such as |
18 |
> |
* genetic algorithms and pipeline designs. |
19 |
|
* |
20 |
|
* <p><b>Sample Usage:</b> |
21 |
|
* Here are the highlights of a class that uses an {@code Exchanger} |
74 |
|
*/ |
75 |
|
public class Exchanger<V> { |
76 |
|
/* |
77 |
< |
* The underlying idea is to use a stack to hold nodes containing |
77 |
< |
* pairs of items to be exchanged. Except that: |
77 |
> |
* Algorithm Description: |
78 |
|
* |
79 |
< |
* * Only one element of the pair is known on creation by a |
80 |
< |
* first-arriving thread; the other is a "hole" waiting to be |
81 |
< |
* filled in. This is a degenerate form of the dual stacks |
82 |
< |
* described in "Nonblocking Concurrent Objects with Condition |
83 |
< |
* Synchronization", by W. N. Scherer III and M. L. Scott. |
84 |
< |
* 18th Annual Conf. on Distributed Computing, Oct. 2004. |
85 |
< |
* It is "degenerate" in that both the items and the holes |
86 |
< |
* are shared in the same nodes. |
87 |
< |
* |
88 |
< |
* * There isn't really a stack here! There can't be -- if two |
89 |
< |
* nodes were both in the stack, they should cancel themselves |
90 |
< |
* out by combining. So that's what we do. The 0th element of |
91 |
< |
* the "arena" array serves only as the top of stack. The |
92 |
< |
* remainder of the array is a form of the elimination backoff |
93 |
< |
* collision array described in "A Scalable Lock-free Stack |
94 |
< |
* Algorithm", by D. Hendler, N. Shavit, and L. Yerushalmi. |
95 |
< |
* 16th ACM Symposium on Parallelism in Algorithms and |
96 |
< |
* Architectures, June 2004. Here, threads spin (using short |
97 |
< |
* timed waits with exponential backoff) looking for each |
98 |
< |
* other. If they fail to find others waiting, they try the |
99 |
< |
* top spot again. As shown in that paper, this always |
100 |
< |
* converges. |
101 |
< |
* |
102 |
< |
* The backoff elimination mechanics never come into play in |
103 |
< |
* common usages where only two threads ever meet to exchange |
104 |
< |
* items, but they prevent contention bottlenecks when an |
105 |
< |
* exchanger is used by a large number of threads. |
106 |
< |
* |
107 |
< |
* For more details, see the paper "A Scalable Elimination-based |
108 |
< |
* Exchange Channel" by William Scherer, Doug Lea, and Michael |
109 |
< |
* Scott in Proceedings of SCOOL05 workshop. Available at: |
110 |
< |
* http://hdl.handle.net/1802/2104 |
79 |
> |
* The basic idea is to maintain a "slot", which is a reference to |
80 |
> |
* a Node containing both an Item to offer and a "hole" waiting to |
81 |
> |
* get filled in. If an incoming "occupying" thread sees that the |
82 |
> |
* slot is null, it CAS'es (compareAndSets) a Node there and waits |
83 |
> |
* for another to invoke exchange. That second "fulfilling" thread |
84 |
> |
* sees that the slot is non-null, and so CASes it back to null, |
85 |
> |
* also exchanging items by CASing the hole, plus waking up the |
86 |
> |
* occupying thread if it is blocked. In each case CAS'es may |
87 |
> |
* fail because a slot at first appears non-null but is null upon |
88 |
> |
* CAS, or vice-versa. So threads may need to retry these |
89 |
> |
* actions. |
90 |
> |
* |
91 |
> |
* This simple approach works great when there are only a few |
92 |
> |
* threads using an Exchanger, but performance rapidly |
93 |
> |
* deteriorates due to CAS contention on the single slot when |
94 |
> |
* there are lots of threads using an exchanger. So instead we use |
95 |
> |
* an "arena"; basically a kind of hash table with a dynamically |
96 |
> |
* varying number of slots, any one of which can be used by |
97 |
> |
* threads performing an exchange. Incoming threads pick slots |
98 |
> |
* based on a hash of their Thread ids. If an incoming thread |
99 |
> |
* fails to CAS in its chosen slot, it picks an alternative slot |
100 |
> |
* instead. And similarly from there. If a thread successfully |
101 |
> |
* CASes into a slot but no other thread arrives, it tries |
102 |
> |
* another, heading toward the zero slot, which always exists even |
103 |
> |
* if the table shrinks. The particular mechanics controlling this |
104 |
> |
* are as follows: |
105 |
> |
* |
106 |
> |
* Waiting: Slot zero is special in that it is the only slot that |
107 |
> |
* exists when there is no contention. A thread occupying slot |
108 |
> |
* zero will block if no thread fulfills it after a short spin. In |
109 |
> |
* other cases, occupying threads eventually give up and try |
110 |
> |
* another slot. Waiting threads spin for a while (a period that |
111 |
> |
* should be a little less than a typical context-switch time) |
112 |
> |
* before either blocking (if slot zero) or giving up (if other |
113 |
> |
* slots) and restarting. There is no reason for threads to block |
114 |
> |
* unless there are unlikely to be any other threads |
115 |
> |
* present. Occupants are mainly avoiding memory contention so sit |
116 |
> |
* there quietly polling for a shorter period than it would take |
117 |
> |
* to block and then unblock them. Non-slot-zero waits that elapse |
118 |
> |
* because of lack of other threads waste around one extra |
119 |
> |
* context-switch time per try, which is still on average much |
120 |
> |
* faster than alternative approaches. |
121 |
> |
* |
122 |
> |
* Sizing: Usually, using only a few slots suffices to reduce |
123 |
> |
* contention. Especially with small numbers of threads, using |
124 |
> |
* too many slots can lead to just as poor performance as using |
125 |
> |
* too few of them, and there's not much room for error. The |
126 |
> |
* variable "max" maintains the number of slots actually in |
127 |
> |
* use. It is increased when a thread sees too many CAS |
128 |
> |
* failures. (This is analogous to resizing a regular hash table |
129 |
> |
* based on a target load factor, except here, growth steps are |
130 |
> |
* just one-by-one rather than proportional.) Growth requires |
131 |
> |
* contention failures in each of three tried slots. Requiring |
132 |
> |
* multiple failures for expansion copes with the fact that some |
133 |
> |
* failed CASes are not due to contention but instead to simple |
134 |
> |
* races between two threads or thread pre-emptions occurring |
135 |
> |
* between reading and CASing. Also, very transient peak |
136 |
> |
* contention can be much higher than the average sustainable |
137 |
> |
* levels. The max limit is decreased on average 50% of the times |
138 |
> |
* that a non-slot-zero wait elapses without being fulfilled. |
139 |
> |
* Threads experiencing elapsed waits move closer to zero, so |
140 |
> |
* eventually find existing (or future) threads even if the table |
141 |
> |
* has been shrunk due to inactivity. The chosen mechanics and |
142 |
> |
* thresholds for growing and shrinking are intrinsically |
143 |
> |
* entangled with indexing and hashing inside the exchange code, |
144 |
> |
* and can't be nicely abstracted out. |
145 |
> |
* |
146 |
> |
* Hashing: Each thread picks its initial slot to use in accord |
147 |
> |
* with a simple hashcode. The sequence is the same on each |
148 |
> |
* encounter by any given thread, but effectively random across |
149 |
> |
* threads. Using arenas encounters the classic cost vs quality |
150 |
> |
* tradeoffs of all hash tables. Here, we use a one-step FNV-1a |
151 |
> |
* hash code based on the current thread's Thread.getId(), along |
152 |
> |
* with a cheap approximation to a mod operation to select an |
153 |
> |
* index. The downside of optimizing index selection in this way |
154 |
> |
* is that the code is hardwired to use a maximum table size of |
155 |
> |
* 32. But this value more than suffices for known platforms and |
156 |
> |
* applications. |
157 |
> |
* |
158 |
> |
* Probing: On sensed contention of a selected slot, we probe |
159 |
> |
* sequentially through the table, analogously to linear probing |
160 |
> |
* after collision in a hash table. (We move circularly, in |
161 |
> |
* reverse order to mesh best with table growth and shrinkage |
162 |
> |
* rules.) Except that to minimize the effects of false-alarms |
163 |
> |
* and cache thrashing, we try the first selected slot twice |
164 |
> |
* before moving. |
165 |
> |
* |
166 |
> |
* Padding: Even with contention management, slots are heavily |
167 |
> |
* contended, so use cache-padding to avoid poor memory |
168 |
> |
* performance. Because of this, slots are lazily constructed only |
169 |
> |
* when used, to avoid wasting this space unnecessarily. While |
170 |
> |
* isolation of locations is not much of an issue at first in an |
171 |
> |
* application, as time goes on and garbage-collectors perform |
172 |
> |
* compaction, slots are very likely to be moved adjacent to each |
173 |
> |
* other, which can cause much thrashing of cache lines on MPs |
174 |
> |
* unless padding is employed. |
175 |
> |
* |
176 |
> |
* This is an improvement of the algorithm described in the paper |
177 |
> |
* "A Scalable Elimination-based Exchange Channel" by William |
178 |
> |
* Scherer, Doug Lea, and Michael Scott in Proceedings of SCOOL05 |
179 |
> |
* workshop. Available at: http://hdl.handle.net/1802/2104 |
180 |
|
*/ |
181 |
|
|
182 |
|
/** The number of CPUs, for sizing and spin control */ |
183 |
< |
static final int NCPUS = Runtime.getRuntime().availableProcessors(); |
183 |
> |
private static final int NCPU = Runtime.getRuntime().availableProcessors(); |
184 |
|
|
185 |
|
/** |
186 |
< |
* Size of collision space. Using a size of half the number of |
187 |
< |
* CPUs provides enough space for threads to find each other but |
188 |
< |
* not so much that it would always require one or more to time |
189 |
< |
* out to become unstuck. Note that the arena array holds SIZE+1 |
190 |
< |
* elements, to include the top-of-stack slot. Imposing a ceiling |
191 |
< |
* is suboptimal for huge machines, but bounds backoff times to |
192 |
< |
* acceptable values. To ensure max times less than 2.4 seconds, |
193 |
< |
* the ceiling value plus the shift value of backoff base (below) |
194 |
< |
* should be less than or equal to 31. |
186 |
> |
* The capacity of the arena. Set to a value that provides more |
187 |
> |
* than enough space to handle contention. On small machines most |
188 |
> |
* slots won't be used, but it is still not wasted because the |
189 |
> |
* extra space provides some machine-level address padding to |
190 |
> |
* minimize interference with heavily CAS'ed Slot locations. And |
191 |
> |
* on very large machines, performance eventually becomes bounded |
192 |
> |
* by memory bandwidth, not numbers of threads/CPUs. This |
193 |
> |
* constant cannot be changed without also modifying indexing and |
194 |
> |
* hashing algorithms. |
195 |
> |
*/ |
196 |
> |
private static final int CAPACITY = 32; |
197 |
> |
|
198 |
> |
/** |
199 |
> |
* The value of "max" that will hold all threads without |
200 |
> |
* contention. When this value is less than CAPACITY, some |
201 |
> |
* otherwise wasted expansion can be avoided. |
202 |
> |
*/ |
203 |
> |
private static final int FULL = |
204 |
> |
Math.max(0, Math.min(CAPACITY, NCPU / 2) - 1); |
205 |
> |
|
206 |
> |
/** |
207 |
> |
* The number of times to spin (doing nothing except polling a |
208 |
> |
* memory location) before blocking or giving up while waiting to |
209 |
> |
* be fulfilled. Should be zero on uniprocessors. On |
210 |
> |
* multiprocessors, this value should be large enough so that two |
211 |
> |
* threads exchanging items as fast as possible block only when |
212 |
> |
* one of them is stalled (due to GC or preemption), but not much |
213 |
> |
* longer, to avoid wasting CPU resources. Seen differently, this |
214 |
> |
* value is a little over half the number of cycles of an average |
215 |
> |
* context switch time on most systems. The value here is |
216 |
> |
* approximately the average of those across a range of tested |
217 |
> |
* systems. |
218 |
|
*/ |
219 |
< |
private static final int SIZE = Math.min(25, (NCPUS + 1) / 2); |
219 |
> |
private static final int SPINS = (NCPU == 1) ? 0 : 2000; |
220 |
|
|
221 |
|
/** |
222 |
< |
* Base unit in nanoseconds for backoffs. Must be a power of two. |
223 |
< |
* Should be small because backoffs exponentially increase from base. |
224 |
< |
* The value should be close to the round-trip time of a call to |
225 |
< |
* LockSupport.park in the case where some other thread has already |
226 |
< |
* called unpark. On multiprocessors, timed waits less than this value |
135 |
< |
* are implemented by spinning. |
222 |
> |
* The number of times to spin before blocking in timed waits. |
223 |
> |
* Timed waits spin more slowly because checking the time takes |
224 |
> |
* time. The best value relies mainly on the relative rate of |
225 |
> |
* System.nanoTime vs memory accesses. The value is empirically |
226 |
> |
* derived to work well across a variety of systems. |
227 |
|
*/ |
228 |
< |
static final long BACKOFF_BASE = (1L << 6); |
228 |
> |
private static final int TIMED_SPINS = SPINS / 20; |
229 |
|
|
230 |
|
/** |
231 |
< |
* The number of nanoseconds for which it is faster to spin rather |
232 |
< |
* than to use timed park. Should normally be zero on |
233 |
< |
* uniprocessors and BACKOFF_BASE on multiprocessors. |
231 |
> |
* Sentinel item representing cancellation of a wait due to |
232 |
> |
* interruption, timeout, or elapsed spin-waits. This value is |
233 |
> |
* placed in holes on cancellation, and used as a return value |
234 |
> |
* from waiting methods to indicate failure to set or get hole. |
235 |
|
*/ |
236 |
< |
static final long spinForTimeoutThreshold = (NCPUS < 2) ? 0 : BACKOFF_BASE; |
236 |
> |
private static final Object CANCEL = new Object(); |
237 |
|
|
238 |
|
/** |
239 |
< |
* The number of times to spin before blocking in timed waits. |
240 |
< |
* The value is empirically derived -- it works well across a |
241 |
< |
* variety of processors and OSes. Empirically, the best value |
150 |
< |
* seems not to vary with number of CPUs (beyond 2) so is just |
151 |
< |
* a constant. |
239 |
> |
* Value representing null arguments/returns from public |
240 |
> |
* methods. This disambiguates from internal requirement that |
241 |
> |
* holes start out as null to mean they are not yet set. |
242 |
|
*/ |
243 |
< |
static final int maxTimedSpins = (NCPUS < 2) ? 0 : 16; |
243 |
> |
private static final Object NULL_ITEM = new Object(); |
244 |
|
|
245 |
|
/** |
246 |
< |
* The number of times to spin before blocking in untimed waits. |
247 |
< |
* This is greater than timed value because untimed waits spin |
248 |
< |
* faster since they don't need to check times on each spin. |
246 |
> |
* Nodes hold partially exchanged data. This class |
247 |
> |
* opportunistically subclasses AtomicReference to represent the |
248 |
> |
* hole. So get() returns hole, and compareAndSet CAS'es value |
249 |
> |
* into hole. This class cannot be parameterized as "V" because of |
250 |
> |
* the use of non-V CANCEL sentinels. |
251 |
|
*/ |
252 |
< |
static final int maxUntimedSpins = maxTimedSpins * 32; |
252 |
> |
private static final class Node extends AtomicReference<Object> { |
253 |
> |
/** The element offered by the Thread creating this node. */ |
254 |
> |
public final Object item; |
255 |
|
|
256 |
< |
/** |
257 |
< |
* Sentinel item representing cancellation. This value is placed |
258 |
< |
* in holes on cancellation, and used as a return value from Node |
259 |
< |
* methods to indicate failure to set or get hole. |
260 |
< |
*/ |
261 |
< |
static final Object FAIL = new Object(); |
256 |
> |
/** The Thread waiting to be signalled; null until waiting. */ |
257 |
> |
public volatile Thread waiter; |
258 |
> |
|
259 |
> |
/** |
260 |
> |
* Creates node with given item and empty hole. |
261 |
> |
* @param item the item |
262 |
> |
*/ |
263 |
> |
public Node(Object item) { |
264 |
> |
this.item = item; |
265 |
> |
} |
266 |
> |
} |
267 |
|
|
268 |
|
/** |
269 |
< |
* The collision arena. arena[0] is used as the top of the stack. |
270 |
< |
* The remainder is used as the collision elimination space. |
271 |
< |
*/ |
272 |
< |
private final AtomicReference<Node>[] arena; |
269 |
> |
* A Slot is an AtomicReference with heuristic padding to lessen |
270 |
> |
* cache effects of this heavily CAS'ed location. While the |
271 |
> |
* padding adds noticeable space, all slots are created only on |
272 |
> |
* demand, and there will be more than one of them only when it |
273 |
> |
* would improve throughput more than enough to outweigh using |
274 |
> |
* extra space. |
275 |
> |
*/ |
276 |
> |
private static final class Slot extends AtomicReference<Object> { |
277 |
> |
// Improve likelihood of isolation on <= 64 byte cache lines |
278 |
> |
long q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, qa, qb, qc, qd, qe; |
279 |
> |
} |
280 |
|
|
281 |
|
/** |
282 |
< |
* Per-thread random number generator. Because random numbers |
283 |
< |
* are used to choose slots and delays to reduce contention, the |
178 |
< |
* random number generator itself cannot introduce contention. |
179 |
< |
* And the statistical quality of the generator is not too |
180 |
< |
* important. So we use a custom cheap generator, and maintain |
181 |
< |
* it as a thread local. |
282 |
> |
* Slot array. Elements are lazily initialized when needed. |
283 |
> |
* Declared volatile to enable double-checked lazy construction. |
284 |
|
*/ |
285 |
< |
private static final ThreadLocal<RNG> random = new ThreadLocal<RNG>() { |
184 |
< |
public RNG initialValue() { return new RNG(); } }; |
285 |
> |
private volatile Slot[] arena = new Slot[CAPACITY]; |
286 |
|
|
287 |
|
/** |
288 |
< |
* Creates a new Exchanger. |
288 |
> |
* The maximum slot index being used. The value sometimes |
289 |
> |
* increases when a thread experiences too many CAS contentions, |
290 |
> |
* and sometimes decreases when a backoff wait elapses. Changes |
291 |
> |
* are performed only via compareAndSet, to avoid stale values |
292 |
> |
* when a thread happens to stall right before setting. |
293 |
|
*/ |
294 |
< |
public Exchanger() { |
190 |
< |
arena = (AtomicReference<Node>[]) new AtomicReference[SIZE + 1]; |
191 |
< |
for (int i = 0; i < arena.length; ++i) |
192 |
< |
arena[i] = new AtomicReference<Node>(); |
193 |
< |
} |
294 |
> |
private final AtomicInteger max = new AtomicInteger(); |
295 |
|
|
296 |
|
/** |
297 |
|
* Main exchange function, handling the different policy variants. |
298 |
|
* Uses Object, not "V" as argument and return value to simplify |
299 |
< |
* handling of internal sentinel values. Callers from public |
300 |
< |
* methods cast accordingly. |
299 |
> |
* handling of sentinel values. Callers from public methods decode |
300 |
> |
* and cast accordingly. |
301 |
|
* |
302 |
< |
* @param item the item to exchange |
302 |
> |
* @param item the (nonnull) item to exchange |
303 |
|
* @param timed true if the wait is timed |
304 |
|
* @param nanos if timed, the maximum wait time |
305 |
< |
* @return the other thread's item |
305 |
> |
* @return the other thread's item, or CANCEL if interrupted or timed out. |
306 |
|
*/ |
307 |
< |
private Object doExchange(Object item, boolean timed, long nanos) |
308 |
< |
throws InterruptedException, TimeoutException { |
309 |
< |
long lastTime = timed ? System.nanoTime() : 0; |
310 |
< |
int idx = 0; // start out at slot representing top |
311 |
< |
int backoff = 0; // increases on failure to occupy a slot |
312 |
< |
Node me = new Node(item); |
313 |
< |
|
314 |
< |
for (;;) { |
315 |
< |
AtomicReference<Node> slot = arena[idx]; |
316 |
< |
Node you = slot.get(); |
317 |
< |
|
318 |
< |
// Try to occupy this slot |
319 |
< |
if (you == null && slot.compareAndSet(null, me)) { |
320 |
< |
// If this is top slot, use regular wait, else backoff-wait |
321 |
< |
Object v = ((idx == 0)? |
322 |
< |
me.waitForHole(timed, nanos) : |
323 |
< |
me.waitForHole(true, randomDelay(backoff))); |
324 |
< |
if (slot.get() == me) |
325 |
< |
slot.compareAndSet(me, null); |
326 |
< |
if (v != FAIL) |
307 |
> |
private Object doExchange(Object item, boolean timed, long nanos) { |
308 |
> |
Node me = new Node(item); // Create in case occupying |
309 |
> |
int index = hashIndex(); // Index of current slot |
310 |
> |
int fails = 0; // Number of CAS failures |
311 |
> |
|
312 |
> |
for (;;) { |
313 |
> |
Object y; // Contents of current slot |
314 |
> |
Slot slot = arena[index]; |
315 |
> |
if (slot == null) // Lazily initialize slots |
316 |
> |
createSlot(index); // Continue loop to reread |
317 |
> |
else if ((y = slot.get()) != null && // Try to fulfill |
318 |
> |
slot.compareAndSet(y, null)) { |
319 |
> |
Node you = (Node)y; // Transfer item |
320 |
> |
if (you.compareAndSet(null, me.item)) { |
321 |
> |
LockSupport.unpark(you.waiter); |
322 |
> |
return you.item; |
323 |
> |
} // Else cancelled; continue |
324 |
> |
} |
325 |
> |
else if (y == null && // Try to occupy |
326 |
> |
slot.compareAndSet(null, me)) { |
327 |
> |
if (index == 0) // Blocking wait for slot 0 |
328 |
> |
return timed? awaitNanos(me, slot, nanos): await(me, slot); |
329 |
> |
Object v = spinWait(me, slot); // Spin wait for non-0 |
330 |
> |
if (v != CANCEL) |
331 |
|
return v; |
332 |
< |
if (Thread.interrupted()) |
333 |
< |
throw new InterruptedException(); |
334 |
< |
if (timed) { |
335 |
< |
long now = System.nanoTime(); |
231 |
< |
nanos -= now - lastTime; |
232 |
< |
lastTime = now; |
233 |
< |
if (nanos <= 0) |
234 |
< |
throw new TimeoutException(); |
235 |
< |
} |
236 |
< |
|
237 |
< |
me = new Node(me.item); // Throw away nodes on failure |
238 |
< |
if (backoff < SIZE - 1) // Increase or stay saturated |
239 |
< |
++backoff; |
240 |
< |
idx = 0; // Restart at top |
241 |
< |
continue; |
332 |
> |
me = new Node(me.item); // Throw away cancelled node |
333 |
> |
int m = max.get(); |
334 |
> |
if (m > (index >>>= 1)) // Decrease index |
335 |
> |
max.compareAndSet(m, m - 1); // Maybe shrink table |
336 |
|
} |
337 |
< |
|
338 |
< |
// Try to release waiter from apparently non-empty slot |
339 |
< |
if (you != null || (you = slot.get()) != null) { |
340 |
< |
boolean success = (you.get() == null && |
341 |
< |
you.compareAndSet(null, me.item)); |
342 |
< |
if (slot.get() == you) |
249 |
< |
slot.compareAndSet(you, null); |
250 |
< |
if (success) { |
251 |
< |
you.signal(); |
252 |
< |
return you.item; |
253 |
< |
} |
337 |
> |
else if (++fails > 1) { // Allow 2 fails on 1st slot |
338 |
> |
int m = max.get(); |
339 |
> |
if (fails > 3 && m < FULL && max.compareAndSet(m, m + 1)) |
340 |
> |
index = m + 1; // Grow on 3rd failed slot |
341 |
> |
else if (--index < 0) |
342 |
> |
index = m; // Circularly traverse |
343 |
|
} |
255 |
– |
|
256 |
– |
// Retry with a random non-top slot <= backoff |
257 |
– |
idx = backoff == 0 ? 1 : 1 + random.get().next() % (backoff + 1); |
344 |
|
} |
345 |
|
} |
346 |
|
|
347 |
|
/** |
348 |
< |
* Returns a random delay less than (base times (2 raised to backoff)). |
349 |
< |
*/ |
350 |
< |
private long randomDelay(int backoff) { |
351 |
< |
return ((BACKOFF_BASE << backoff) - 1) & random.get().next(); |
348 |
> |
* Returns a hash index for current thread. Uses a one-step |
349 |
> |
* FNV-1a hash code (http://www.isthe.com/chongo/tech/comp/fnv/) |
350 |
> |
* based on the current thread's Thread.getId(). These hash codes |
351 |
> |
* have more uniform distribution properties with respect to small |
352 |
> |
* moduli (here 1-31) than do other simple hashing functions. To |
353 |
> |
* return an index between 0 and max, we use a cheap approximation |
354 |
> |
* to a mod operation, that also corrects for bias due to |
355 |
> |
* non-power-of-2 remaindering (see {@link |
356 |
> |
* java.util.Random#nextInt}). Bits of the hashcode are masked |
357 |
> |
* with "nbits", the ceiling power of two of table size (looked up |
358 |
> |
* in a table packed into three ints). If too large, this is |
359 |
> |
* retried after rotating the hash by nbits bits, while forcing |
360 |
> |
* new top bit to 0, which guarantees eventual termination |
361 |
> |
* (although with a non-random-bias). This requires an average of |
362 |
> |
* less than 2 tries for all table sizes, and has a maximum 2% |
363 |
> |
* difference from perfectly uniform slot probabilities when |
364 |
> |
* applied to all possible hash codes for sizes less than 32. |
365 |
> |
* |
366 |
> |
* @return a per-thread-random index, 0 <= index <= max |
367 |
> |
*/ |
368 |
> |
private final int hashIndex() { |
369 |
> |
long id = Thread.currentThread().getId(); |
370 |
> |
int hash = (((int)(id ^ (id >>> 32))) ^ 0x811c9dc5) * 0x01000193; |
371 |
> |
|
372 |
> |
int m = max.get(); |
373 |
> |
int nbits = (((0xfffffc00 >> m) & 4) | // Compute ceil(log2(m+1)) |
374 |
> |
((0x000001f8 >>> m) & 2) | // The constants hold |
375 |
> |
((0xffff00f2 >>> m) & 1)); // a lookup table |
376 |
> |
int index; |
377 |
> |
while ((index = hash & ((1 << nbits) - 1)) > m) // May retry on |
378 |
> |
hash = (hash >>> nbits) | (hash << (33 - nbits)); // non-power-2 m |
379 |
> |
return index; |
380 |
|
} |
381 |
|
|
382 |
|
/** |
383 |
< |
* Nodes hold partially exchanged data. This class |
384 |
< |
* opportunistically subclasses AtomicReference to represent the |
385 |
< |
* hole. So get() returns hole, and compareAndSet CAS'es value |
386 |
< |
* into hole. Note that this class cannot be parameterized as V |
387 |
< |
* because the sentinel value FAIL is only of type Object. |
388 |
< |
*/ |
389 |
< |
static final class Node extends AtomicReference<Object> { |
390 |
< |
private static final long serialVersionUID = -3221313401284163686L; |
383 |
> |
* Creates a new slot at given index. Called only when the slot |
384 |
> |
* appears to be null. Relies on double-check using builtin locks, |
385 |
> |
* since they rarely contend. |
386 |
> |
* |
387 |
> |
* @param index the index to add slot at |
388 |
> |
*/ |
389 |
> |
private void createSlot(int index) { |
390 |
> |
// Create slot outside of lock to narrow sync region |
391 |
> |
Slot newSlot = new Slot(); |
392 |
> |
Slot[] a = arena; |
393 |
> |
synchronized(a) { |
394 |
> |
if (a[index] == null) |
395 |
> |
a[index] = newSlot; |
396 |
> |
} |
397 |
> |
} |
398 |
|
|
399 |
< |
/** The element offered by the Thread creating this node. */ |
400 |
< |
final Object item; |
399 |
> |
/** |
400 |
> |
* Try to cancel a wait for the given node waiting in the given |
401 |
> |
* slot, if so, helping clear the node from its slot to avoid |
402 |
> |
* garbage retention. |
403 |
> |
* |
404 |
> |
* @param node the waiting node |
405 |
> |
* @param slot the slot it is waiting in |
406 |
> |
* @return true if successfully cancelled |
407 |
> |
*/ |
408 |
> |
private static boolean tryCancel(Node node, Slot slot) { |
409 |
> |
if (!node.compareAndSet(null, CANCEL)) |
410 |
> |
return false; |
411 |
> |
if (slot.get() == node) |
412 |
> |
slot.compareAndSet(node, null); |
413 |
> |
return true; |
414 |
> |
} |
415 |
|
|
416 |
< |
/** The Thread waiting to be signalled; null until waiting. */ |
417 |
< |
volatile Thread waiter; |
416 |
> |
// Three forms of waiting. Each just different enough not to merge |
417 |
> |
// code with others. |
418 |
|
|
419 |
< |
/** |
420 |
< |
* Creates node with given item and empty hole. |
421 |
< |
* |
422 |
< |
* @param item the item |
423 |
< |
*/ |
424 |
< |
Node(Object item) { |
425 |
< |
this.item = item; |
419 |
> |
/** |
420 |
> |
* Spin-waits for hole for a non-0 slot. Fails if spin elapses |
421 |
> |
* before hole filled. Does not check interrupt, relying on check |
422 |
> |
* in public exchange method to abort if interrupted on entry. |
423 |
> |
* |
424 |
> |
* @param node the waiting node |
425 |
> |
* @return on success, the hole; on failure, CANCEL |
426 |
> |
*/ |
427 |
> |
private static Object spinWait(Node node, Slot slot) { |
428 |
> |
int spins = SPINS; |
429 |
> |
for (;;) { |
430 |
> |
Object v = node.get(); |
431 |
> |
if (v != null) |
432 |
> |
return v; |
433 |
> |
else if (spins > 0) |
434 |
> |
--spins; |
435 |
> |
else |
436 |
> |
tryCancel(node, slot); |
437 |
|
} |
438 |
+ |
} |
439 |
|
|
440 |
< |
/** |
441 |
< |
* Unparks thread if it is waiting. |
442 |
< |
*/ |
443 |
< |
void signal() { |
444 |
< |
LockSupport.unpark(waiter); |
440 |
> |
/** |
441 |
> |
* Waits for (by spinning and/or blocking) and gets the hole |
442 |
> |
* filled in by another thread. Fails if interrupted before |
443 |
> |
* hole filled. |
444 |
> |
* |
445 |
> |
* When a node/thread is about to block, it sets its waiter field |
446 |
> |
* and then rechecks state at least one more time before actually |
447 |
> |
* parking, thus covering race vs fulfiller noticing that waiter |
448 |
> |
* is non-null so should be woken. |
449 |
> |
* |
450 |
> |
* Thread interruption status is checked only surrounding calls to |
451 |
> |
* park. The caller is assumed to have checked interrupt status |
452 |
> |
* on entry. |
453 |
> |
* |
454 |
> |
* @param node the waiting node |
455 |
> |
* @return on success, the hole; on failure, CANCEL |
456 |
> |
*/ |
457 |
> |
private static Object await(Node node, Slot slot) { |
458 |
> |
Thread w = Thread.currentThread(); |
459 |
> |
int spins = SPINS; |
460 |
> |
for (;;) { |
461 |
> |
Object v = node.get(); |
462 |
> |
if (v != null) |
463 |
> |
return v; |
464 |
> |
else if (spins > 0) // Spin-wait phase |
465 |
> |
--spins; |
466 |
> |
else if (node.waiter == null) // Set up to block next |
467 |
> |
node.waiter = w; |
468 |
> |
else if (w.isInterrupted()) // Abort on interrupt |
469 |
> |
tryCancel(node, slot); |
470 |
> |
else // Block |
471 |
> |
LockSupport.park(node); |
472 |
|
} |
473 |
+ |
} |
474 |
|
|
475 |
< |
/** |
476 |
< |
* Waits for and gets the hole filled in by another thread. |
477 |
< |
* Fails if timed out or interrupted before hole filled. |
478 |
< |
* |
479 |
< |
* @param timed true if the wait is timed |
480 |
< |
* @param nanos if timed, the maximum wait time |
481 |
< |
* @return on success, the hole; on failure, FAIL |
482 |
< |
*/ |
483 |
< |
Object waitForHole(boolean timed, long nanos) { |
484 |
< |
long lastTime = timed ? System.nanoTime() : 0; |
485 |
< |
int spins = timed ? maxTimedSpins : maxUntimedSpins; |
486 |
< |
Thread w = Thread.currentThread(); |
487 |
< |
for (;;) { |
488 |
< |
if (w.isInterrupted()) |
489 |
< |
compareAndSet(null, FAIL); |
490 |
< |
Object h = get(); |
491 |
< |
if (h != null) |
492 |
< |
return h; |
493 |
< |
if (timed) { |
494 |
< |
long now = System.nanoTime(); |
495 |
< |
nanos -= now - lastTime; |
496 |
< |
lastTime = now; |
497 |
< |
if (nanos <= 0) { |
498 |
< |
compareAndSet(null, FAIL); |
324 |
< |
continue; |
325 |
< |
} |
326 |
< |
} |
475 |
> |
/** |
476 |
> |
* Waits for (at index 0) and gets the hole filled in by another |
477 |
> |
* thread. Fails if timed out or interrupted before hole filled. |
478 |
> |
* Same basic logic as untimed version, but a bit messier. |
479 |
> |
* |
480 |
> |
* @param node the waiting node |
481 |
> |
* @param nanos the wait time |
482 |
> |
* @return on success, the hole; on failure, CANCEL |
483 |
> |
*/ |
484 |
> |
private Object awaitNanos(Node node, Slot slot, long nanos) { |
485 |
> |
int spins = TIMED_SPINS; |
486 |
> |
long lastTime = 0; |
487 |
> |
Thread w = null; |
488 |
> |
for (;;) { |
489 |
> |
Object v = node.get(); |
490 |
> |
if (v != null) |
491 |
> |
return v; |
492 |
> |
long now = System.nanoTime(); |
493 |
> |
if (w == null) |
494 |
> |
w = Thread.currentThread(); |
495 |
> |
else |
496 |
> |
nanos -= now - lastTime; |
497 |
> |
lastTime = now; |
498 |
> |
if (nanos > 0) { |
499 |
|
if (spins > 0) |
500 |
|
--spins; |
501 |
< |
else if (waiter == null) |
502 |
< |
waiter = w; |
503 |
< |
else if (!timed) |
504 |
< |
LockSupport.park(this); |
505 |
< |
else if (nanos > spinForTimeoutThreshold) |
506 |
< |
LockSupport.parkNanos(this, nanos); |
501 |
> |
else if (node.waiter == null) |
502 |
> |
node.waiter = w; |
503 |
> |
else if (w.isInterrupted()) |
504 |
> |
tryCancel(node, slot); |
505 |
> |
else |
506 |
> |
LockSupport.parkNanos(node, nanos); |
507 |
|
} |
508 |
+ |
else if (tryCancel(node, slot) && !w.isInterrupted()) |
509 |
+ |
return scanOnTimeout(node); |
510 |
|
} |
511 |
|
} |
512 |
|
|
513 |
|
/** |
514 |
+ |
* Sweeps through arena checking for any waiting threads. Called |
515 |
+ |
* only upon return from timeout while waiting in slot 0. When a |
516 |
+ |
* thread gives up on a timed wait, it is possible that a |
517 |
+ |
* previously-entered thread is still waiting in some other |
518 |
+ |
* slot. So we scan to check for any. This is almost always |
519 |
+ |
* overkill, but decreases the likelihood of timeouts when there |
520 |
+ |
* are other threads present to far less than that in lock-based |
521 |
+ |
* exchangers in which earlier-arriving threads may still be |
522 |
+ |
* waiting on entry locks. |
523 |
+ |
* |
524 |
+ |
* @param node the waiting node |
525 |
+ |
* @return another thread's item, or CANCEL |
526 |
+ |
*/ |
527 |
+ |
private Object scanOnTimeout(Node node) { |
528 |
+ |
Object y; |
529 |
+ |
for (int j = arena.length - 1; j >= 0; --j) { |
530 |
+ |
Slot slot = arena[j]; |
531 |
+ |
if (slot != null) { |
532 |
+ |
while ((y = slot.get()) != null) { |
533 |
+ |
if (slot.compareAndSet(y, null)) { |
534 |
+ |
Node you = (Node)y; |
535 |
+ |
if (you.compareAndSet(null, node.item)) { |
536 |
+ |
LockSupport.unpark(you.waiter); |
537 |
+ |
return you.item; |
538 |
+ |
} |
539 |
+ |
} |
540 |
+ |
} |
541 |
+ |
} |
542 |
+ |
} |
543 |
+ |
return CANCEL; |
544 |
+ |
} |
545 |
+ |
|
546 |
+ |
/** |
547 |
+ |
* Creates a new Exchanger. |
548 |
+ |
*/ |
549 |
+ |
public Exchanger() { |
550 |
+ |
} |
551 |
+ |
|
552 |
+ |
/** |
553 |
|
* Waits for another thread to arrive at this exchange point (unless |
554 |
|
* the current thread is {@link Thread#interrupt interrupted}), |
555 |
|
* and then transfers the given object to it, receiving its object |
583 |
|
* interrupted while waiting |
584 |
|
*/ |
585 |
|
public V exchange(V x) throws InterruptedException { |
586 |
< |
try { |
587 |
< |
return (V)doExchange(x, false, 0); |
588 |
< |
} catch (TimeoutException cannotHappen) { |
589 |
< |
throw new Error(cannotHappen); |
586 |
> |
if (!Thread.interrupted()) { |
587 |
> |
Object v = doExchange(x == null? NULL_ITEM : x, false, 0); |
588 |
> |
if (v == NULL_ITEM) |
589 |
> |
return null; |
590 |
> |
if (v != CANCEL) |
591 |
> |
return (V)v; |
592 |
> |
Thread.interrupted(); // Clear interrupt status on IE throw |
593 |
|
} |
594 |
+ |
throw new InterruptedException(); |
595 |
|
} |
596 |
|
|
597 |
|
/** |
623 |
|
* then {@link InterruptedException} is thrown and the current thread's |
624 |
|
* interrupted status is cleared. |
625 |
|
* |
626 |
< |
* <p>If the specified waiting time elapses then {@link TimeoutException} |
627 |
< |
* is thrown. |
628 |
< |
* If the time is |
412 |
< |
* less than or equal to zero, the method will not wait at all. |
626 |
> |
* <p>If the specified waiting time elapses then {@link |
627 |
> |
* TimeoutException} is thrown. If the time is less than or equal |
628 |
> |
* to zero, the method will not wait at all. |
629 |
|
* |
630 |
|
* @param x the object to exchange |
631 |
|
* @param timeout the maximum time to wait |
638 |
|
*/ |
639 |
|
public V exchange(V x, long timeout, TimeUnit unit) |
640 |
|
throws InterruptedException, TimeoutException { |
641 |
< |
return (V)doExchange(x, true, unit.toNanos(timeout)); |
642 |
< |
} |
643 |
< |
|
644 |
< |
/** |
645 |
< |
* Cheap XorShift random number generator used for determining |
646 |
< |
* elimination array slots and backoff delays. This uses the |
647 |
< |
* simplest of the generators described in George Marsaglia's |
648 |
< |
* "Xorshift RNGs" paper. This is not a high-quality generator |
649 |
< |
* but is acceptable here. |
434 |
< |
*/ |
435 |
< |
static final class RNG { |
436 |
< |
/** Use java.util.Random as seed generator for new RNGs. */ |
437 |
< |
private static final Random seedGenerator = new Random(); |
438 |
< |
private int seed = seedGenerator.nextInt() | 1; |
439 |
< |
|
440 |
< |
/** |
441 |
< |
* Returns random nonnegative integer. |
442 |
< |
*/ |
443 |
< |
int next() { |
444 |
< |
int x = seed; |
445 |
< |
x ^= x << 6; |
446 |
< |
x ^= x >>> 21; |
447 |
< |
seed = x ^= x << 7; |
448 |
< |
return x & 0x7FFFFFFF; |
641 |
> |
if (!Thread.interrupted()) { |
642 |
> |
Object v = doExchange(x == null? NULL_ITEM : x, |
643 |
> |
true, unit.toNanos(timeout)); |
644 |
> |
if (v == NULL_ITEM) |
645 |
> |
return null; |
646 |
> |
if (v != CANCEL) |
647 |
> |
return (V)v; |
648 |
> |
if (!Thread.interrupted()) |
649 |
> |
throw new TimeoutException(); |
650 |
|
} |
651 |
+ |
throw new InterruptedException(); |
652 |
|
} |
451 |
– |
|
653 |
|
} |