/* * Written by Doug and released to the public domain, as explained at * http://creativecommons.org/licenses/publicdomain */ /** * Adaptations and extensions of Ternary Search Trie * algorithms presented by Bentley and Sedgewick, * * Fast Algorithms for Sorting and Searching Strings, * Eighth Annual ACM-SIAM Symposium on Discrete Algorithms * New Orleans, January, 1997. * * last update Mon Sep 4 16:29:44 2000 Doug Lea (dl at gee) * Todo: SortedSet ops, maybe balancing, map version. * * Notes: This was written pre-JDK1.5, and so, the tie-in tactic for * claiming they are collections is a little sleazy, but probably the * best that can be done: Attempts to add a non-String produce * UnsupportedOperationException. * * In very light tests, this seems about 25% faster than redblack * trees of Strings in most cases, although the worst-case isn't so * good. And unfortunately, the worst case is a typical one: inserting * words that are already in sorted order. I ought to do something * about this. **/ import java.util.*; public class TernarySearchTrieSet extends AbstractSet implements Cloneable, java.io.Serializable { /** * An artifical string terminator; must be less than any valid char. * To allow any char to appear in a string (even 0), we use * an int. Since chars are unsigned, any negative value will do. **/ protected static final int TERMINATOR = -1; /** * Return i'th char of s, or TERMINATOR, if no such. * This allows us to pretend that all strings are padded with TERMINATOR, * which makes the rest of the code much simpler. **/ protected static int getChar(int i, String s) { return (i >= s.length())? TERMINATOR : s.charAt(i); } /** * Node class for tries **/ protected static class Node { /** The string, used only if a terminal node **/ String string; /** character to compare to at this level **/ int split; /** Nodes less than split **/ Node left; /** Nodes greater than split **/ Node right; /** Nodes prefixed by split **/ Node eq; protected Node(int c) { split = c; } } /** * The number of elements **/ private transient int count; /** * The number of structural modifications to the tree. */ private transient int modCount; /** * The root. Null when empty. **/ private transient Node root; /** * Constructs a new, empty set */ public TernarySearchTrieSet() { } /** * Returns the number of elements in this set (its cardinality). * * @return the number of elements in this set (its cardinality). */ public int size() { return count; } /** * Returns true if this set contains no elements. * * @return true if this set contains no elements. */ public boolean isEmpty() { return count == 0; } /** * Returns true if this set contains the specified element. * * @param x element whose presence in this set is to be tested. * @return true if this set contains the specified element. */ public boolean contains(Object x) { if (!(x instanceof String)) return false; String s = (String)x; int i = 0; int c = getChar(i, s); Node p = root; for (;;) { if (p == null) return false; else if (c == p.split) { if (c == TERMINATOR) return true; else { c = getChar(++i, s); p = p.eq; } } else if (c < p.split) p = p.left; else p = p.right; } } /** * Adds the specified element to this set if it is not already * present. Only Strings may be added. * * @param x element to be added to this set. * @return true if the set did not already contain the specified * element. * @throws UnsupportedOperationException if x is not a String. */ public boolean xadd(Object x) { if (!(x instanceof String)) throw new UnsupportedOperationException(); String s = (String)x; int i = 0; int c = getChar(i, s); if (root == null) root = new Node(c); Node p = root; for (;;) { if (c < p.split) { if (p.left == null) p.left = new Node(c); p = p.left; } else if (c > p.split) { if (p.right == null) p.right = new Node(c); p = p.right; } else if (c != TERMINATOR) { c = getChar(++i, s); if (p.eq == null) p.eq = new Node(c); p = p.eq; } else if (p.string != null) return false; else { p.string = s; ++count; ++modCount; return true; } } } public boolean add(Object x) { if (!(x instanceof String)) throw new UnsupportedOperationException(); String s = (String)x; int i = 0; int c = getChar(i, s); if (root == null) root = new Node(c); Node p = root; Node parent = null; for (;;) { if (c < p.split) { if (p.left == null) p.left = new Node(c); parent = p; p = p.left; } else if (c > p.split) { if (p.right == null) { Node n = new Node(c); if (p.split == TERMINATOR) { n.left = p; if (parent == null) root = n; else if (p == parent.left) parent.left = n; else if (p == parent.right) parent.right = n; else parent.eq = n; p = n; } else { p.right = n; parent = p; p = n; } } else { parent = p; p = p.right; } } else if (c != TERMINATOR) { c = getChar(++i, s); if (p.eq == null) p.eq = new Node(c); parent = p; p = p.eq; } else if (p.string != null) return false; else { p.string = s; ++count; ++modCount; return true; } } } public boolean xxadd(Object x) { if (!(x instanceof String)) throw new UnsupportedOperationException(); String s = (String)x; int i = 0; int c = getChar(i, s); if (root == null) root = new Node(c); Node p = root; Node parent = null; for (;;) { if (c < p.split) { if (p.left == null) { Node n = new Node(c); if (false && p.split != TERMINATOR && p.split > 'l') { n.right = p; if (parent == null) root = n; else if (p == parent.left) parent.left = n; else if (p == parent.right) parent.right = n; else parent.eq = n; p = n; } else { p.left = n; parent = p; p = n; } } else { parent = p; p = p.left; } } else if (c > p.split) { if (p.right == null) { Node n = new Node(c); if (p.split == TERMINATOR || p.split < 'l') { n.left = p; if (parent == null) root = n; else if (p == parent.left) parent.left = n; else if (p == parent.right) parent.right = n; else parent.eq = n; p = n; } else { p.right = n; parent = p; p = n; } } else { parent = p; p = p.right; } } else if (c != TERMINATOR) { c = getChar(++i, s); if (p.eq == null) p.eq = new Node(c); parent = p; p = p.eq; } else if (p.string != null) return false; else { p.string = s; ++count; ++modCount; return true; } } } /** * Removes the given element from this set if it is present. * * @param x object to be removed from this set, if present. * @return true if the set contained the specified element. */ public boolean remove(Object x) { if (!(x instanceof String)) return false; String s = (String)x; int mc = modCount; root = del(root, s, 0); return (modCount == mc+1); } /** * Recursive helper method for remove(); * This probably wouldn't be noticeably faster, but would be * more complicated, if done nonrecursively. **/ private Node del(Node p, String s, int i) { if (p == null) return null; int c = getChar(i, s); if (c < p.split) p.left = del(p.left, s, i); else if (c > p.split) p.right = del(p.right, s, i); else if (c != TERMINATOR) p.eq = del(p.eq, s, i+1); else if (p.string != null) { p.string = null; --count; ++modCount; } // Unlink unnecessary nodes on way out of recursion if (p.eq == null && p.string == null) { if (p.right == null) return p.left; else if (p.left == null) return p.right; else { // Replace node with predecessor Node lpar = p.left; if (lpar.right == null) { // Special-case when p.left is pred lpar.right = p.right; return lpar; } else { // Copy in pred's fields, and override links Node pred = lpar; while (pred.right != null) { lpar = pred; pred = pred.right; } lpar.right = pred.left; p.split = pred.split; p.string = pred.string; p.eq = pred.eq; return p; } } } else return p; } /** * Removes all of the elements from this set. */ public void clear() { root = null; count = 0; ++modCount; } /** * Return a list containing matches to pattern. * The pattern may contain number of occurrences * of a "don't care" character, that matches any character in * that position. For example, * partialMatches(".o.o.o", '.') might return [Pocono, rococo]. * @param target - the pattern string * @param dontcare - the character to treat as a don't care symbol in target. * **/ public List partialMatches(String pattern, char dontcare) { ArrayList list = new ArrayList(); pmsearch(root, pattern, 0, dontcare, list); return list; } /** * Equivalent to partialMatches(pattern, '.'); **/ public List partialMatches(String pattern) { ArrayList list = new ArrayList(); pmsearch(root, pattern, 0, '.', list); return list; } private void pmsearch(Node p, String s, int i, char dontcare, ArrayList list) { if (p == null) return; int c = getChar(i, s); if (c == dontcare || c < p.split) pmsearch(p.left, s, i, dontcare, list); if ( (c == dontcare || c == p.split) && (p.split != TERMINATOR && c != TERMINATOR)) pmsearch(p.eq, s, i+1, dontcare, list); if (c == TERMINATOR && c == p.split) list.add(p.string); if (c == dontcare || c > p.split) pmsearch(p.right, s, i, dontcare, list); } /** * Return a list contining all words within a given * Hamming distance of target word. For example, (assuming that * these words are in the set) * nearMatches("soda", 0) returns [soda]
* nearMatches("soda", 1) returns [coda, sod, soda, sods, sofa, * soma, sora, soya]
* nearMatches("soda", 2) returns [Aida, Boca, Cody, Dada, Dodd, ... * bode, body, bona, coca, cod, coda, code, cola, coma, dodo, god, gods, ... * pods, rod, rode, rods, sad, saga, sedan, sera, side, sima, siva, so, ... * sow, sown, soy, soya, sud, suds, today, yoga]. **/ public List nearMatches(String target, int HammingDistance) { ArrayList list = new ArrayList(); if (HammingDistance >= 0) nsearch(root, target, 0, HammingDistance, list); return list; } private void nsearch(Node p, String s, int i, int d, ArrayList list) { if (p == null) return; int c = getChar(i, s); if (d > 0 || c < p.split) nsearch(p.left, s, i, d, list); if (p.split == TERMINATOR) { if (s.length() - i <= d) list.add(p.string); } else { nsearch(p.eq, s, (c == TERMINATOR)? i : (i+1), (c == p.split)? d : (d-1), list); } if (d > 0 || c > p.split) nsearch(p.right, s, i, d, list); } /** * Returns an iterator over the elements in this set. The elements * are returned in String order. * * @return an Iterator over the elements in this set. * @see ConcurrentModificationException */ public Iterator iterator() { return new TernarySearchTrieSetIterator(); } /** * Iterator for TernarySearchTrieSet **/ private class TernarySearchTrieSetIterator implements Iterator { /** * A stack is needed to track traversal state. Stack elements * logically consist of a node, plus a record of whether * only the left or the left+eq children have been traversed yet. * It's a little cheaper to use parallel arrays for nodes and * positions than it would be to create a separate bookkeeping class. **/ private static final int INITIAL_TRAVERSAL_STACK_CAPACITY = 64; private static final int LEFT_DONE = 0; private static final int EQ_DONE = 1; private static final int RIGHT_DONE = 2; private int[] posStack = new int[INITIAL_TRAVERSAL_STACK_CAPACITY]; private Node[] nodeStack = new Node[INITIAL_TRAVERSAL_STACK_CAPACITY]; private int sp; // stack pointer /** Next node to return **/ private Node current; /** traversal state of current node **/ private int traversalPosition; /** Last node returned by next(), to allow remove() **/ private Node lastReturned; /** fast-fail iterator support **/ private int expectedModCount = modCount; private void push(Node n, int cp) { if (sp >= nodeStack.length) { int oldlen = nodeStack.length; int newlen = oldlen * 2; Node[] ns = new Node[newlen]; int[] ps = new int[newlen]; for (int i = 0; i < oldlen; ++i) { ns[i] = nodeStack[i]; ps[i] = posStack[i]; } nodeStack = ns; posStack = ps; } nodeStack[sp] = n; posStack[sp] = cp; ++sp; } private void pop() { if (sp == 0) current = null; else { --sp; current = nodeStack[sp]; traversalPosition = posStack[sp]; } } /** * Traverse down to least element starting from p * pushing nodes along the way **/ private void descend(Node p) { for (;;) { if (p.left != null) { push(p, LEFT_DONE); p = p.left; } else if (p.eq != null) { push(p, EQ_DONE); p = p.eq; } else { current = p; traversalPosition = LEFT_DONE; break; } } } TernarySearchTrieSetIterator() { if (root != null) descend(root); } public boolean hasNext() { return current != null; } public Object next() { if (modCount != expectedModCount) throw new ConcurrentModificationException(); if (current == null) throw new NoSuchElementException(); lastReturned = current; Object value = current.string; for (;;) { if (traversalPosition == LEFT_DONE) { if (current.eq != null) { push(current, EQ_DONE); descend(current.eq); break; } else traversalPosition = EQ_DONE; } if (traversalPosition == EQ_DONE) { if (current.right != null) { descend(current.right); break; } else traversalPosition = RIGHT_DONE; } pop(); if (current == null || current.string != null) break; } return value; } public void remove() { if (lastReturned == null) throw new IllegalStateException(); if (modCount != expectedModCount) throw new ConcurrentModificationException(); // Tree modifications in del are written in way that cannot affect // stack, so the traversal can continue normally. TernarySearchTrieSet.this.remove(lastReturned.string); lastReturned = null; expectedModCount = modCount; } } /** * Save the state of this set instance to a stream (that is, * serialize this set). * * @serialData The number of elements, * followed by all of the elements. */ private void writeObject(java.io.ObjectOutputStream s) throws java.io.IOException { s.defaultWriteObject(); // Write out number of elements s.writeInt(count); // Write out elements for (Iterator it = iterator(); it.hasNext(); ) s.writeUTF((String)(it.next())); } /** * Reconstitute a set from a stream (that is, deserialize it). */ private void readObject(java.io.ObjectInputStream s) throws java.io.IOException, ClassNotFoundException { s.defaultReadObject(); // Read in size int size = s.readInt(); String[] items = new String[size]; for (int i = 0; i < size; ++i) items[i] = s.readUTF(); addFromSortedArray(items, 0, size-1); } /** * Returns a shallow copy of this set: the elements * themselves are not cloned. * * @return a shallow copy of this set. */ public Object clone() { TernarySearchTrieSet clone = null; try { clone = (TernarySearchTrieSet)super.clone(); } catch (CloneNotSupportedException e) { throw new InternalError(); } clone.root = null; clone.count = 0; clone.modCount = 0; // Spill out to array to avoid worst-case tree String[] items = new String[count]; int j = 0; for (Iterator it = iterator(); it.hasNext(); ) items[j++] = (String)(it.next()); clone.addFromSortedArray(items, 0, count-1); return clone; } private void addFromSortedArray(String[] items, int lo, int hi) { if (lo > hi) return; int mid = (lo + hi) / 2; add(items[mid]); addFromSortedArray(items, lo, mid-1); addFromSortedArray(items, mid+1, hi); } }