Skip to content
Snippets Groups Projects
Commit 466fd064 authored by Reynold Xin's avatar Reynold Xin
Browse files

Incorporated ideas from pull request #200.

- Use Murmur Hash 3 finalization step to scramble the bits of HashCode
  instead of the simpler version in java.util.HashMap; the latter one
  had trouble with ranges of consecutive integers. Murmur Hash 3 is used
  by fastutil.

- Don't check keys for equality when re-inserting due to growing the
  table; the keys will already be unique

- Remember the grow threshold instead of recomputing it on each insert
parent 95c55df1
No related branches found
No related tags found
No related merge requests found
...@@ -79,6 +79,7 @@ class OpenHashSet[@specialized(Long, Int) T: ClassManifest]( ...@@ -79,6 +79,7 @@ class OpenHashSet[@specialized(Long, Int) T: ClassManifest](
protected var _capacity = nextPowerOf2(initialCapacity) protected var _capacity = nextPowerOf2(initialCapacity)
// _capacity - 1; addWithoutResize ANDs hashes and probe positions with it to get a slot index.
protected var _mask = _capacity - 1 protected var _mask = _capacity - 1
protected var _size = 0 protected var _size = 0
// Right column only (added by this commit): cached resize threshold, loadFactor * _capacity
// truncated to Int. rehashIfNeeded compares _size against it and rehash recomputes it.
protected var _growThreshold = (loadFactor * _capacity).toInt
protected var _bitset = new BitSet(_capacity) protected var _bitset = new BitSet(_capacity)
...@@ -115,7 +116,29 @@ class OpenHashSet[@specialized(Long, Int) T: ClassManifest]( ...@@ -115,7 +116,29 @@ class OpenHashSet[@specialized(Long, Int) T: ClassManifest](
* @return The position where the key is placed, plus the highest order bit is set if the key * @return The position where the key is placed, plus the highest order bit is set if the key
* did not exist previously. * did not exist previously.
*/ */
def addWithoutResize(k: T): Int = putInto(_bitset, _data, k) def addWithoutResize(k: T): Int = {
// The row above is a side-by-side diff row: the left column is the old one-line delegation to
// putInto, the right column opens the new inlined body that follows.
// First slot to try: the scrambled hash of k reduced to an index with _mask.
var pos = hashcode(hasher.hash(k)) & _mask
// Probe step; it grows by one after every collision with a different key.
var i = 1
// The loop is left only through one of the two returns below, so it keeps probing for as long
// as every visited slot holds some other key.
while (true) {
if (!_bitset.get(pos)) {
// This is a new key.
_data(pos) = k
_bitset.set(pos)
_size += 1
// The returned position carries NONEXISTENCE_MASK to signal that k was inserted by this call.
return pos | NONEXISTENCE_MASK
} else if (_data(pos) == k) {
// Found an existing key.
return pos
} else {
// Slot taken by a different key: move on by the current step (1, 2, 3, ...) and wrap
// around with _mask.
val delta = i
pos = (pos + delta) & _mask
i += 1
}
}
// Never reached here
// The assert condition is always false, so it would fail if control ever got past the loop;
// INVALID_POS gives the method body its Int result.
assert(INVALID_POS != INVALID_POS)
INVALID_POS
}
/** /**
* Rehash the set if it is overloaded. * Rehash the set if it is overloaded.
...@@ -126,7 +149,7 @@ class OpenHashSet[@specialized(Long, Int) T: ClassManifest]( ...@@ -126,7 +149,7 @@ class OpenHashSet[@specialized(Long, Int) T: ClassManifest](
* to a new position (in the new data array). * to a new position (in the new data array).
*/ */
def rehashIfNeeded(k: T, allocateFunc: (Int) => Unit, moveFunc: (Int, Int) => Unit) { def rehashIfNeeded(k: T, allocateFunc: (Int) => Unit, moveFunc: (Int, Int) => Unit) {
// Left column: the limit loadFactor * _capacity is recomputed on every call.
// Right column: _size is compared against the cached _growThreshold instead.
if (_size > loadFactor * _capacity) { if (_size > _growThreshold) {
// k, allocateFunc and moveFunc are passed through to rehash unchanged.
rehash(k, allocateFunc, moveFunc) rehash(k, allocateFunc, moveFunc)
} }
} }
...@@ -160,37 +183,6 @@ class OpenHashSet[@specialized(Long, Int) T: ClassManifest]( ...@@ -160,37 +183,6 @@ class OpenHashSet[@specialized(Long, Int) T: ClassManifest](
*/ */
def nextPos(fromPos: Int): Int = _bitset.nextSetBit(fromPos) def nextPos(fromPos: Int): Int = _bitset.nextSetBit(fromPos)
/**
 * Store `k` in the given bitset/data pair and report the slot it occupies.
 *
 * The result is the bare slot index when `k` was already stored there. When `k` was absent and
 * has just been inserted, the slot index is OR-ed with NONEXISTENCE_MASK and `_size` is
 * incremented.
 *
 * The caller must make sure `data` still has at least one free slot; otherwise probing for a
 * key that is not present never finishes.
 */
private def putInto(bitset: BitSet, data: Array[T], k: T): Int = {
  val mask = data.length - 1
  var slot = hashcode(hasher.hash(k)) & mask
  var step = 1
  var result = INVALID_POS
  var searching = true
  while (searching) {
    if (bitset.get(slot)) {
      if (data(slot) == k) {
        // The key is already stored in this slot.
        result = slot
        searching = false
      } else {
        // Occupied by another key: advance by a step that grows by one per collision.
        slot = (slot + step) & mask
        step += 1
      }
    } else {
      // Free slot: claim it for the new key.
      data(slot) = k
      bitset.set(slot)
      _size += 1
      result = slot | NONEXISTENCE_MASK
      searching = false
    }
  }
  result
}
/** /**
* Double the table's size and re-hash everything. We are not really using k, but it is declared * Double the table's size and re-hash everything. We are not really using k, but it is declared
* so Scala compiler can specialize this method (which leads to calling the specialized version * so Scala compiler can specialize this method (which leads to calling the specialized version
...@@ -204,34 +196,49 @@ class OpenHashSet[@specialized(Long, Int) T: ClassManifest]( ...@@ -204,34 +196,49 @@ class OpenHashSet[@specialized(Long, Int) T: ClassManifest](
*/ */
private def rehash(k: T, allocateFunc: (Int) => Unit, moveFunc: (Int, Int) => Unit) { private def rehash(k: T, allocateFunc: (Int) => Unit, moveFunc: (Int, Int) => Unit) {
// Side-by-side diff rows: where a row holds two statements, the left one is the old version
// and the right one is the new version. Both versions double the table, re-place every
// occupied slot into fresh storage, and then swap the new storage in.
val newCapacity = _capacity * 2 val newCapacity = _capacity * 2
// Right column only: refuse to grow beyond 2^29 slots.
require(newCapacity <= (1 << 29), "Can't make capacity bigger than 2^29 elements")
// allocateFunc receives the new capacity before any entry is moved (presumably so the caller
// can size its own storage to match; confirm against callers).
allocateFunc(newCapacity) allocateFunc(newCapacity)
val newData = new Array[T](newCapacity)
val newBitset = new BitSet(newCapacity) val newBitset = new BitSet(newCapacity)
var pos = 0 val newData = new Array[T](newCapacity)
// Left column: _size is zeroed because putInto increments it again for every re-inserted key.
// The right column never touches _size.
_size = 0 val newMask = newCapacity - 1
while (pos < _capacity) {
if (_bitset.get(pos)) { var oldPos = 0
// NOTE(review): the right column loops on `capacity`, while the left column used `_capacity`.
// Presumably `capacity` is an accessor defined outside this hunk; confirm.
val newPos = putInto(newBitset, newData, _data(pos)) while (oldPos < capacity) {
moveFunc(pos, newPos & POSITION_MASK) if (_bitset.get(oldPos)) {
val key = _data(oldPos)
var newPos = hashcode(hasher.hash(key)) & newMask
var i = 1
var keepGoing = true
// No need to check for equality here when we insert so this has one less if branch than
// the similar code path in addWithoutResize.
while (keepGoing) {
if (!newBitset.get(newPos)) {
// Inserting the key at newPos
newData(newPos) = key
newBitset.set(newPos)
// Report the move with the slot index in the old table and the slot index in the new one.
moveFunc(oldPos, newPos)
keepGoing = false
} else {
// Occupied slot: same increasing-step probe as addWithoutResize, wrapped with newMask.
val delta = i
newPos = (newPos + delta) & newMask
i += 1
}
}
} }
pos += 1 oldPos += 1
} }
// Switch over to the new storage and refresh the fields derived from the capacity.
_bitset = newBitset _bitset = newBitset
_data = newData _data = newData
_capacity = newCapacity _capacity = newCapacity
_mask = newCapacity - 1 _mask = newMask
_growThreshold = (loadFactor * newCapacity).toInt
} }
/** /**
* Re-hash a value to deal better with hash functions that don't differ * Re-hash a value to deal better with hash functions that don't differ in the lower bits.
* in the lower bits, similar to java.util.HashMap * We use the Murmur Hash 3 finalization step that's also used in fastutil.
*/ */
private def hashcode(h: Int): Int = { private def hashcode(h: Int): Int = it.unimi.dsi.fastutil.HashCommon.murmurHash3(h)
// The row above is a side-by-side diff row: the right column is the new one-line body, which
// delegates to fastutil's HashCommon.murmurHash3. The lines below are the removed body, which
// mixes h with two rounds of unsigned-shift-and-xor.
val r = h ^ (h >>> 20) ^ (h >>> 12)
r ^ (r >>> 7) ^ (r >>> 4)
}
private def nextPowerOf2(n: Int): Int = { private def nextPowerOf2(n: Int): Int = {
val highBit = Integer.highestOneBit(n) val highBit = Integer.highestOneBit(n)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment