Skip to content
Snippets Groups Projects
Commit 07fd7d36 authored by Reynold Xin's avatar Reynold Xin
Browse files

[SPARK-9460] Avoid byte array allocation in StringPrefixComparator.

As of today, StringPrefixComparator converts the long values back to byte arrays in order to compare them. This patch optimizes this to compare the longs directly, rather than turning the longs into byte arrays and comparing them byte by byte (unsigned).

This only works on little-endian architecture right now.

Author: Reynold Xin <rxin@databricks.com>

Closes #7765 from rxin/SPARK-9460 and squashes the following commits:

e4908cc [Reynold Xin] Stricter randomized tests.
4c8d094 [Reynold Xin] [SPARK-9460] Avoid byte array allocation in StringPrefixComparator.
parent 9514d874
No related branches found
No related tags found
No related merge requests found
......@@ -17,9 +17,7 @@
package org.apache.spark.util.collection.unsafe.sort;
import com.google.common.base.Charsets;
import com.google.common.primitives.Longs;
import com.google.common.primitives.UnsignedBytes;
import com.google.common.primitives.UnsignedLongs;
import org.apache.spark.annotation.Private;
import org.apache.spark.unsafe.types.UTF8String;
......@@ -36,32 +34,11 @@ public class PrefixComparators {
public static final class StringPrefixComparator extends PrefixComparator {
@Override
public int compare(long aPrefix, long bPrefix) {
// TODO: can done more efficiently
byte[] a = Longs.toByteArray(aPrefix);
byte[] b = Longs.toByteArray(bPrefix);
for (int i = 0; i < 8; i++) {
int c = UnsignedBytes.compare(a[i], b[i]);
if (c != 0) return c;
}
return 0;
}
public long computePrefix(byte[] bytes) {
if (bytes == null) {
return 0L;
} else {
byte[] padded = new byte[8];
System.arraycopy(bytes, 0, padded, 0, Math.min(bytes.length, 8));
return Longs.fromByteArray(padded);
}
}
public long computePrefix(String value) {
return value == null ? 0L : computePrefix(value.getBytes(Charsets.UTF_8));
return UnsignedLongs.compare(aPrefix, bPrefix);
}
public long computePrefix(UTF8String value) {
return value == null ? 0L : computePrefix(value.getBytes());
return value == null ? 0L : value.getPrefix();
}
}
......
......@@ -17,22 +17,29 @@
package org.apache.spark.util.collection.unsafe.sort
import com.google.common.primitives.UnsignedBytes
import org.scalatest.prop.PropertyChecks
import org.apache.spark.SparkFunSuite
import org.apache.spark.unsafe.types.UTF8String
class PrefixComparatorsSuite extends SparkFunSuite with PropertyChecks {
test("String prefix comparator") {
def testPrefixComparison(s1: String, s2: String): Unit = {
val s1Prefix = PrefixComparators.STRING.computePrefix(s1)
val s2Prefix = PrefixComparators.STRING.computePrefix(s2)
val utf8string1 = UTF8String.fromString(s1)
val utf8string2 = UTF8String.fromString(s2)
val s1Prefix = PrefixComparators.STRING.computePrefix(utf8string1)
val s2Prefix = PrefixComparators.STRING.computePrefix(utf8string2)
val prefixComparisonResult = PrefixComparators.STRING.compare(s1Prefix, s2Prefix)
val cmp = UnsignedBytes.lexicographicalComparator().compare(
utf8string1.getBytes.take(8), utf8string2.getBytes.take(8))
assert(
(prefixComparisonResult == 0) ||
(prefixComparisonResult < 0 && s1 < s2) ||
(prefixComparisonResult > 0 && s1 > s2))
(prefixComparisonResult == 0 && cmp == 0) ||
(prefixComparisonResult < 0 && s1.compareTo(s2) < 0) ||
(prefixComparisonResult > 0 && s1.compareTo(s2) > 0))
}
// scalastyle:off
......
......@@ -137,6 +137,15 @@ public final class UTF8String implements Comparable<UTF8String>, Serializable {
return len;
}
/**
* Returns a 64-bit integer that can be used as the prefix used in sorting.
*/
public long getPrefix() {
long p = PlatformDependent.UNSAFE.getLong(base, offset);
p = java.lang.Long.reverseBytes(p);
return p;
}
/**
* Returns the underline bytes, will be a copy of it if it's part of another array.
*/
......
......@@ -63,8 +63,19 @@ public class UTF8StringSuite {
assertEquals(0, EMPTY_UTF8.numBytes());
}
@Test
public void prefix() {
assertTrue(fromString("a").getPrefix() - fromString("b").getPrefix() < 0);
assertTrue(fromString("ab").getPrefix() - fromString("b").getPrefix() < 0);
assertTrue(
fromString("abbbbbbbbbbbasdf").getPrefix() - fromString("bbbbbbbbbbbbasdf").getPrefix() < 0);
assertTrue(fromString("").getPrefix() - fromString("a").getPrefix() < 0);
assertTrue(fromString("你好").getPrefix() - fromString("世界").getPrefix() > 0);
}
@Test
public void compareTo() {
assertTrue(fromString("").compareTo(fromString("a")) < 0);
assertTrue(fromString("abc").compareTo(fromString("ABC")) > 0);
assertTrue(fromString("abc0").compareTo(fromString("abc")) > 0);
assertTrue(fromString("abcabcabc").compareTo(fromString("abcabcabc")) == 0);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment