Skip to content
Snippets Groups Projects
Commit 6853ac7c authored by Tarek Auel, committed by Reynold Xin
Browse files

[SPARK-9156][SQL] codegen StringSplit

Jira: https://issues.apache.org/jira/browse/SPARK-9156

Author: Tarek Auel <tarek.auel@googlemail.com>

Closes #7547 from tarekauel/SPARK-9156 and squashes the following commits:

0be2700 [Tarek Auel] [SPARK-9156][SQL] indentation fix
b860eaf [Tarek Auel] [SPARK-9156][SQL] codegen StringSplit
5ad6a1f [Tarek Auel] [SPARK-9156] codegen StringSplit
parent 047ccc8c
No related branches found
No related tags found
No related merge requests found
...@@ -615,7 +615,7 @@ case class StringSpace(child: Expression) ...@@ -615,7 +615,7 @@ case class StringSpace(child: Expression)
* Splits str around pat (pattern is a regular expression). * Splits str around pat (pattern is a regular expression).
*/ */
case class StringSplit(str: Expression, pattern: Expression) case class StringSplit(str: Expression, pattern: Expression)
extends BinaryExpression with ImplicitCastInputTypes with CodegenFallback { extends BinaryExpression with ImplicitCastInputTypes {
override def left: Expression = str override def left: Expression = str
override def right: Expression = pattern override def right: Expression = pattern
...@@ -623,9 +623,13 @@ case class StringSplit(str: Expression, pattern: Expression) ...@@ -623,9 +623,13 @@ case class StringSplit(str: Expression, pattern: Expression)
override def inputTypes: Seq[DataType] = Seq(StringType, StringType) override def inputTypes: Seq[DataType] = Seq(StringType, StringType)
override def nullSafeEval(string: Any, regex: Any): Any = { override def nullSafeEval(string: Any, regex: Any): Any = {
val splits = string.asInstanceOf[UTF8String].split(regex.asInstanceOf[UTF8String], -1).toSeq
string.asInstanceOf[UTF8String].toString.split(regex.asInstanceOf[UTF8String].toString, -1) }
splits.toSeq.map(UTF8String.fromString)
override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
nullSafeCodeGen(ctx, ev, (str, pattern) =>
s"""${ev.primitive} = scala.collection.JavaConversions.asScalaBuffer(
java.util.Arrays.asList($str.split($pattern, -1)));""")
} }
override def prettyName: String = "split" override def prettyName: String = "split"
......
...@@ -487,6 +487,15 @@ public final class UTF8String implements Comparable<UTF8String>, Serializable { ...@@ -487,6 +487,15 @@ public final class UTF8String implements Comparable<UTF8String>, Serializable {
return fromBytes(result); return fromBytes(result);
} }
/**
 * Splits this string around matches of the given regular expression.
 *
 * Both this string and the pattern are converted to java.lang.String and the work is
 * delegated to String.split(String, int), so the regex syntax and the meaning of
 * {@code limit} are exactly those of that method. Each resulting piece is converted
 * back with fromString.
 *
 * @param pattern the regular expression to split around
 * @param limit passed through unchanged to String.split
 * @return the pieces of this string, in order, as UTF8String values
 */
public UTF8String[] split(UTF8String pattern, int limit) {
  final String[] parts = toString().split(pattern.toString(), limit);
  final UTF8String[] converted = new UTF8String[parts.length];
  int idx = 0;
  for (String part : parts) {
    converted[idx++] = fromString(part);
  }
  return converted;
}
@Override @Override
public String toString() { public String toString() {
try { try {
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
package org.apache.spark.unsafe.types; package org.apache.spark.unsafe.types;
import java.io.UnsupportedEncodingException; import java.io.UnsupportedEncodingException;
import java.util.Arrays;
import org.junit.Test; import org.junit.Test;
...@@ -270,6 +271,16 @@ public class UTF8StringSuite { ...@@ -270,6 +271,16 @@ public class UTF8StringSuite {
fromString("数据砖头孙行者孙行者孙行"), fromString("数据砖头孙行者孙行者孙行"),
fromString("数据砖头").rpad(12, fromString("孙行者"))); fromString("数据砖头").rpad(12, fromString("孙行者")));
} }
/**
 * Checks UTF8String.split against hand-written expected arrays for three cases:
 * an unbounded split (limit -1), a bounded split (limit 2), and an input that ends
 * with the delimiter (limit -1).
 */
@Test
public void split() {
  // Negative limit: the pattern is applied as many times as possible.
  assertTrue(Arrays.equals(fromString("ab,def,ghi").split(fromString(","), -1),
    new UTF8String[]{fromString("ab"), fromString("def"), fromString("ghi")}));
  // Limit 2: at most two pieces, the remainder stays unsplit in the last one.
  assertTrue(Arrays.equals(fromString("ab,def,ghi").split(fromString(","), 2),
    new UTF8String[]{fromString("ab"), fromString("def,ghi")}));
  // This slot previously repeated the limit-2 assertion verbatim. It now covers a
  // trailing delimiter with limit -1, where the trailing empty string must be kept.
  assertTrue(Arrays.equals(fromString("ab,def,ghi,").split(fromString(","), -1),
    new UTF8String[]{fromString("ab"), fromString("def"), fromString("ghi"), fromString("")}));
}
@Test @Test
public void levenshteinDistance() { public void levenshteinDistance() {
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment