Skip to content
Snippets Groups Projects
Commit 7775d9f2 authored by Sandeep Singh's avatar Sandeep Singh Committed by Sean Owen
Browse files

[SPARK-17299] TRIM/LTRIM/RTRIM should not strip characters other than spaces

## What changes were proposed in this pull request?
TRIM/LTRIM/RTRIM should not strip characters other than spaces. Previously we were trimming every character less than or equal to ASCII 0x20 (space), which also removed control characters such as tabs and newlines.

## How was this patch tested?
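Fixed the existing tests and added a test case to UTF8StringSuite that checks characters below 0x20 are no longer trimmed.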

Author: Sandeep Singh <sandeep@techaddict.me>

Closes #14924 from techaddict/SPARK-17299.
parent 6c08dbf6
No related branches found
No related tags found
No related merge requests found
......@@ -465,9 +465,9 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable,
int s = 0;
int e = this.numBytes - 1;
// skip all of the space (0x20) in the left side
while (s < this.numBytes && getByte(s) <= 0x20 && getByte(s) >= 0x00) s++;
while (s < this.numBytes && getByte(s) == 0x20) s++;
// skip all of the space (0x20) in the right side
while (e >= 0 && getByte(e) <= 0x20 && getByte(e) >= 0x00) e--;
while (e >= 0 && getByte(e) == 0x20) e--;
if (s > e) {
// empty string
return EMPTY_UTF8;
......@@ -479,7 +479,7 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable,
public UTF8String trimLeft() {
int s = 0;
// skip all of the space (0x20) in the left side
while (s < this.numBytes && getByte(s) <= 0x20 && getByte(s) >= 0x00) s++;
while (s < this.numBytes && getByte(s) == 0x20) s++;
if (s == this.numBytes) {
// empty string
return EMPTY_UTF8;
......@@ -491,7 +491,7 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable,
public UTF8String trimRight() {
int e = numBytes - 1;
// skip all of the space (0x20) in the right side
while (e >= 0 && getByte(e) <= 0x20 && getByte(e) >= 0x00) e--;
while (e >= 0 && getByte(e) == 0x20) e--;
if (e < 0) {
// empty string
......
......@@ -232,6 +232,16 @@ public class UTF8StringSuite {
assertEquals(fromString("数据砖头"), fromString("数据砖头").trim());
assertEquals(fromString("数据砖头"), fromString("数据砖头").trimLeft());
assertEquals(fromString("数据砖头"), fromString("数据砖头").trimRight());
char[] charsLessThan0x20 = new char[10];
Arrays.fill(charsLessThan0x20, (char)(' ' - 1));
String stringStartingWithSpace =
new String(charsLessThan0x20) + "hello" + new String(charsLessThan0x20);
assertEquals(fromString(stringStartingWithSpace), fromString(stringStartingWithSpace).trim());
assertEquals(fromString(stringStartingWithSpace),
fromString(stringStartingWithSpace).trimLeft());
assertEquals(fromString(stringStartingWithSpace),
fromString(stringStartingWithSpace).trimRight());
}
@Test
......
......@@ -98,7 +98,7 @@ class UTF8StringPropertyCheckSuite extends FunSuite with GeneratorDrivenProperty
}
}
val whitespaceChar: Gen[Char] = Gen.choose(0x00, 0x20).map(_.toChar)
val whitespaceChar: Gen[Char] = Gen.const(0x20.toChar)
val whitespaceString: Gen[String] = Gen.listOf(whitespaceChar).map(_.mkString)
val randomString: Gen[String] = Arbitrary.arbString.arbitrary
......@@ -107,7 +107,7 @@ class UTF8StringPropertyCheckSuite extends FunSuite with GeneratorDrivenProperty
def lTrim(s: String): String = {
var st = 0
val array: Array[Char] = s.toCharArray
while ((st < s.length) && (array(st) <= ' ')) {
while ((st < s.length) && (array(st) == ' ')) {
st += 1
}
if (st > 0) s.substring(st, s.length) else s
......@@ -115,7 +115,7 @@ class UTF8StringPropertyCheckSuite extends FunSuite with GeneratorDrivenProperty
def rTrim(s: String): String = {
var len = s.length
val array: Array[Char] = s.toCharArray
while ((len > 0) && (array(len - 1) <= ' ')) {
while ((len > 0) && (array(len - 1) == ' ')) {
len -= 1
}
if (len < s.length) s.substring(0, len) else s
......@@ -127,7 +127,7 @@ class UTF8StringPropertyCheckSuite extends FunSuite with GeneratorDrivenProperty
whitespaceString
) { (start: String, middle: String, end: String) =>
val s = start + middle + end
assert(toUTF8(s).trim() === toUTF8(s.trim()))
assert(toUTF8(s).trim() === toUTF8(rTrim(lTrim(s))))
assert(toUTF8(s).trimLeft() === toUTF8(lTrim(s)))
assert(toUTF8(s).trimRight() === toUTF8(rTrim(s)))
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment