[Git][java-team/airlift-slice][upstream] New upstream version 0.16
Andreas Tille (@tille)
gitlab at salsa.debian.org
Thu Jan 16 15:13:23 GMT 2025
Andreas Tille pushed to branch upstream at Debian Java Maintainers / airlift-slice
Commits:
0b121495 by Andreas Tille at 2025-01-16T15:57:57+01:00
New upstream version 0.16
- - - - -
12 changed files:
- .travis.yml
- pom.xml
- + src/main/java/io/airlift/slice/InvalidCodePointException.java
- + src/main/java/io/airlift/slice/InvalidUtf8Exception.java
- src/main/java/io/airlift/slice/Slice.java
- + src/main/java/io/airlift/slice/SliceUtf8.java
- src/main/java/io/airlift/slice/Slices.java
- src/main/java/io/airlift/slice/UnsafeSliceFactory.java
- src/main/java/io/airlift/slice/XxHash64.java
- + src/test/java/io/airlift/slice/SliceUtf8Benchmark.java
- src/test/java/io/airlift/slice/TestSlice.java
- + src/test/java/io/airlift/slice/TestSliceUtf8.java
Changes:
=====================================
.travis.yml
=====================================
@@ -1 +1,4 @@
language: java
+
+jdk:
+ - oraclejdk8
=====================================
pom.xml
=====================================
@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion>
<artifactId>slice</artifactId>
- <version>0.10</version>
+ <version>0.16</version>
<packaging>jar</packaging>
<name>slice</name>
@@ -13,7 +13,7 @@
<parent>
<groupId>io.airlift</groupId>
<artifactId>airbase</artifactId>
- <version>31</version>
+ <version>38</version>
</parent>
<inceptionYear>2012</inceptionYear>
@@ -21,6 +21,7 @@
<properties>
<air.check.skip-extended>true</air.check.skip-extended>
<air.check.skip-license>false</air.check.skip-license>
+ <air.javadoc.lint>-missing</air.javadoc.lint>
<dep.jmh.version>0.9.4</dep.jmh.version>
</properties>
@@ -78,6 +79,6 @@
</build>
<scm>
- <tag>0.10</tag>
+ <tag>0.16</tag>
</scm>
</project>
=====================================
src/main/java/io/airlift/slice/InvalidCodePointException.java
=====================================
@@ -0,0 +1,33 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.airlift.slice;
+
+import static java.lang.Integer.toHexString;
+
+/**
+ * Signals that an integer value is not a valid code point. The offending
+ * value is kept and can be read back with {@link #getCodePoint()}.
+ */
+public class InvalidCodePointException
+ extends IllegalArgumentException
+{
+ // the offending value, exactly as passed to the constructor
+ private final int codePoint;
+
+ /**
+ * Creates the exception with the message {@code "Invalid code point 0x"}
+ * followed by the value in upper-case hexadecimal.
+ *
+ * @param codePoint the offending value
+ */
+ public InvalidCodePointException(int codePoint)
+ {
+ super("Invalid code point 0x" + toHexString(codePoint).toUpperCase());
+ this.codePoint = codePoint;
+ }
+
+ /**
+ * @return the value that was passed to the constructor
+ */
+ public int getCodePoint()
+ {
+ return codePoint;
+ }
+}
=====================================
src/main/java/io/airlift/slice/InvalidUtf8Exception.java
=====================================
@@ -0,0 +1,23 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.airlift.slice;
+
+/**
+ * An {@link IllegalArgumentException} used to report UTF-8 data that is not
+ * well formed.
+ */
+public class InvalidUtf8Exception
+ extends IllegalArgumentException
+{
+ /**
+ * @param message description of the problem, passed to the superclass unchanged
+ */
+ public InvalidUtf8Exception(String message)
+ {
+ super(message);
+ }
+}
=====================================
src/main/java/io/airlift/slice/Slice.java
=====================================
@@ -35,6 +35,7 @@ import static io.airlift.slice.SizeOf.SIZE_OF_INT;
import static io.airlift.slice.SizeOf.SIZE_OF_LONG;
import static io.airlift.slice.SizeOf.SIZE_OF_SHORT;
import static io.airlift.slice.StringDecoder.decodeString;
+import static java.lang.Math.min;
import static java.lang.String.format;
import static java.nio.charset.StandardCharsets.UTF_8;
import static sun.misc.Unsafe.ARRAY_BOOLEAN_BASE_OFFSET;
@@ -74,7 +75,7 @@ public final class Slice
* this slice; otherwise, address is the offset from the base object.
* This base plus relative offset addressing is taken directly from
* the Unsafe interface.
- * <p/>
+ * <p>
* Note: if base object is a byte array, this address ARRAY_BYTE_BASE_OFFSET,
* since the byte array data starts AFTER the byte array object header.
*/
@@ -85,6 +86,11 @@ public final class Slice
*/
private final int size;
+ /**
+ * Bytes retained by the slice
+ */
+ private final int retainedSize;
+
/**
* Reference is typically a ByteBuffer object, but can be any object this
* slice must hold onto to assure that the underlying memory is not
@@ -102,6 +108,7 @@ public final class Slice
this.base = null;
this.address = 0;
this.size = 0;
+ this.retainedSize = 0;
this.reference = null;
}
@@ -114,6 +121,7 @@ public final class Slice
this.base = base;
this.address = ARRAY_BYTE_BASE_OFFSET;
this.size = base.length;
+ this.retainedSize = base.length;
this.reference = null;
}
@@ -128,6 +136,7 @@ public final class Slice
this.base = base;
this.address = ARRAY_BYTE_BASE_OFFSET + offset;
this.size = length;
+ this.retainedSize = base.length;
this.reference = null;
}
@@ -142,6 +151,7 @@ public final class Slice
this.base = base;
this.address = ARRAY_BOOLEAN_BASE_OFFSET + offset;
this.size = length * ARRAY_BOOLEAN_INDEX_SCALE;
+ this.retainedSize = base.length * ARRAY_BOOLEAN_INDEX_SCALE;
this.reference = null;
}
@@ -156,6 +166,7 @@ public final class Slice
this.base = base;
this.address = ARRAY_SHORT_BASE_OFFSET + offset;
this.size = length * ARRAY_SHORT_INDEX_SCALE;
+ this.retainedSize = base.length * ARRAY_SHORT_INDEX_SCALE;
this.reference = null;
}
@@ -170,6 +181,7 @@ public final class Slice
this.base = base;
this.address = ARRAY_INT_BASE_OFFSET + offset;
this.size = length * ARRAY_INT_INDEX_SCALE;
+ this.retainedSize = base.length * ARRAY_INT_INDEX_SCALE;
this.reference = null;
}
@@ -184,6 +196,7 @@ public final class Slice
this.base = base;
this.address = ARRAY_LONG_BASE_OFFSET + offset;
this.size = length * ARRAY_LONG_INDEX_SCALE;
+ this.retainedSize = base.length * ARRAY_LONG_INDEX_SCALE;
this.reference = null;
}
@@ -198,6 +211,7 @@ public final class Slice
this.base = base;
this.address = ARRAY_FLOAT_BASE_OFFSET + offset;
this.size = length * ARRAY_FLOAT_INDEX_SCALE;
+ this.retainedSize = base.length * ARRAY_FLOAT_INDEX_SCALE;
this.reference = null;
}
@@ -212,13 +226,14 @@ public final class Slice
this.base = base;
this.address = ARRAY_DOUBLE_BASE_OFFSET + offset;
this.size = length * ARRAY_DOUBLE_INDEX_SCALE;
+ this.retainedSize = base.length * ARRAY_DOUBLE_INDEX_SCALE;
this.reference = null;
}
/**
* Creates a slice for directly accessing the base object.
*/
- Slice(@Nullable Object base, long address, int size, @Nullable Object reference)
+ Slice(@Nullable Object base, long address, int size, int retainedSize, @Nullable Object reference)
{
if (address <= 0) {
throw new IllegalArgumentException(format("Invalid address: %s", address));
@@ -232,6 +247,7 @@ public final class Slice
this.base = base;
this.address = address;
this.size = size;
+ this.retainedSize = retainedSize;
}
/**
@@ -260,6 +276,14 @@ public final class Slice
return size;
}
+ /**
+ * Approximate number of bytes retained by this slice.
+ * <p>
+ * This is the value recorded when the slice was constructed. The array
+ * backed constructors record the byte size of the whole backing array
+ * (array length times the element index scale) rather than the size of
+ * the visible region, so the result can be larger than {@link #length()}.
+ *
+ * @return the recorded retained size in bytes
+ */
+ public int getRetainedSize()
+ {
+ return retainedSize;
+ }
+
/**
* Fill the slice with the specified value;
*/
@@ -313,6 +337,11 @@ public final class Slice
public byte getByte(int index)
{
checkIndexLength(index, SIZE_OF_BYTE);
+ return getByteUnchecked(index);
+ }
+
+ byte getByteUnchecked(int index)
+ {
return unsafe.getByte(base, address + index);
}
@@ -338,6 +367,11 @@ public final class Slice
public short getShort(int index)
{
checkIndexLength(index, SIZE_OF_SHORT);
+ return getShortUnchecked(index);
+ }
+
+ short getShortUnchecked(int index)
+ {
return unsafe.getShort(base, address + index);
}
@@ -351,6 +385,11 @@ public final class Slice
public int getInt(int index)
{
checkIndexLength(index, SIZE_OF_INT);
+ return getIntUnchecked(index);
+ }
+
+ public int getIntUnchecked(int index)
+ {
return unsafe.getInt(base, address + index);
}
@@ -364,6 +403,11 @@ public final class Slice
public long getLong(int index)
{
checkIndexLength(index, SIZE_OF_LONG);
+ return getLongUnchecked(index);
+ }
+
+ long getLongUnchecked(int index)
+ {
return unsafe.getLong(base, address + index);
}
@@ -494,9 +538,14 @@ public final class Slice
{
checkIndexLength(index, length);
+ if (base instanceof byte[]) {
+ out.write((byte[]) base, (int) ((address - ARRAY_BYTE_BASE_OFFSET) + index), length);
+ return;
+ }
+
byte[] buffer = new byte[4096];
while (length > 0) {
- int size = Math.min(buffer.length, length);
+ int size = min(buffer.length, length);
getBytes(index, buffer, 0, size);
out.write(buffer, 0, size);
length -= size;
@@ -514,6 +563,11 @@ public final class Slice
public void setByte(int index, int value)
{
checkIndexLength(index, SIZE_OF_BYTE);
+ setByteUnchecked(index, value);
+ }
+
+ void setByteUnchecked(int index, int value)
+ {
unsafe.putByte(base, address + index, (byte) (value & 0xFF));
}
@@ -528,6 +582,11 @@ public final class Slice
public void setShort(int index, int value)
{
checkIndexLength(index, SIZE_OF_SHORT);
+ setShortUnchecked(index, value);
+ }
+
+ void setShortUnchecked(int index, int value)
+ {
unsafe.putShort(base, address + index, (short) (value & 0xFFFF));
}
@@ -541,6 +600,11 @@ public final class Slice
public void setInt(int index, int value)
{
checkIndexLength(index, SIZE_OF_INT);
+ setIntUnchecked(index, value);
+ }
+
+ void setIntUnchecked(int index, int value)
+ {
unsafe.putInt(base, address + index, value);
}
@@ -658,7 +722,7 @@ public final class Slice
byte[] bytes = new byte[4096];
while (length > 0) {
- int bytesRead = in.read(bytes, 0, Math.min(bytes.length, length));
+ int bytesRead = in.read(bytes, 0, min(bytes.length, length));
if (bytesRead < 0) {
throw new IndexOutOfBoundsException("End of stream");
}
@@ -681,20 +745,119 @@ public final class Slice
if (length == 0) {
return Slices.EMPTY_SLICE;
}
- return new Slice(base, address + index, length, reference);
+ return new Slice(base, address + index, length, retainedSize, reference);
}
public int indexOfByte(int b)
{
b = b & 0xFF;
for (int i = 0; i < size; i++) {
- if (unsafe.getByte(base, address + i) == b) {
+ if (getByteUnchecked(i) == b) {
return i;
}
}
return -1;
}
+ /**
+ * Returns the index of the first occurrence of the pattern within this
+ * slice, searching from index 0. This is {@link #indexOf(Slice, int)}
+ * with an offset of zero.
+ * <p>
+ * If the pattern is not found, or this slice is empty, -1 is returned.
+ * Otherwise, if the pattern is empty, zero is returned.
+ *
+ * @param slice the pattern to search for
+ * @return the index of the first match, or -1
+ */
+ public int indexOf(Slice slice)
+ {
+ return indexOf(slice, 0);
+ }
+
+    /**
+     * Returns the index of the first occurrence of the pattern within this
+     * slice, starting the search at {@code offset}.
+     * <p>
+     * Returns -1 if the pattern is not found, if this slice is empty, or if
+     * {@code offset} is negative or not less than the size of this slice.
+     * Otherwise, if the pattern is empty, {@code offset} is returned.
+     *
+     * @param pattern the bytes to search for
+     * @param offset the index in this slice at which the search starts
+     * @return the index of the first match at or after {@code offset}, or -1
+     */
+    public int indexOf(Slice pattern, int offset)
+    {
+        // A negative offset must be rejected here: every read below is
+        // unchecked and would otherwise address memory in front of this slice.
+        if (size == 0 || offset < 0 || offset >= size) {
+            return -1;
+        }
+
+        if (pattern.length() == 0) {
+            return offset;
+        }
+
+        // The fast path reads four bytes at a time; fall back when the
+        // pattern or this slice is too short for that
+        if (pattern.length() < SIZE_OF_INT || size < SIZE_OF_LONG) {
+            return indexOfBruteForce(pattern, offset);
+        }
+
+        // Using first four bytes for faster search. We are not using eight bytes for long
+        // because we want more strings to get use of fast search.
+        int head = pattern.getIntUnchecked(0);
+
+        // Replicate the first byte of head into all four bytes for faster skipping
+        int firstByteMask = head & 0xff;
+        firstByteMask |= firstByteMask << 8;
+        firstByteMask |= firstByteMask << 16;
+
+        // pattern.length() >= 4 here, so a four byte read at any index up to
+        // lastValidIndex stays inside this slice
+        int lastValidIndex = size - pattern.length();
+        int index = offset;
+        while (index <= lastValidIndex) {
+            // Read four bytes in sequence
+            int value = getIntUnchecked(index);
+
+            // Compare all bytes of value with first byte of search data
+            // see https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
+            int valueXor = value ^ firstByteMask;
+            int hasZeroBytes = (valueXor - 0x01010101) & ~valueXor & 0x80808080;
+
+            // If valueXor does not have any zero byte then none of these four
+            // positions can start a match and we can skip all of them
+            if (hasZeroBytes == 0) {
+                index += SIZE_OF_INT;
+                continue;
+            }
+
+            // Try fast match of head and the rest
+            if (value == head && equalsUnchecked(index, pattern, 0, pattern.length())) {
+                return index;
+            }
+
+            index++;
+        }
+
+        return -1;
+    }
+
+    /**
+     * Byte by byte search for {@code pattern} in this slice, starting at
+     * {@code offset}. Used when the inputs are too short for the four byte
+     * fast path.
+     * <p>
+     * Returns -1 if the pattern is not found, if this slice is empty, or if
+     * {@code offset} is negative or not less than the size of this slice.
+     * Otherwise, if the pattern is empty, {@code offset} is returned.
+     */
+    int indexOfBruteForce(Slice pattern, int offset)
+    {
+        // A negative offset must be rejected here: the reads below are
+        // unchecked and would otherwise address memory in front of this slice.
+        if (size == 0 || offset < 0 || offset >= size) {
+            return -1;
+        }
+
+        if (pattern.length() == 0) {
+            return offset;
+        }
+
+        byte firstByte = pattern.getByteUnchecked(0);
+        // last index at which the whole pattern still fits; negative when the
+        // pattern is longer than this slice, in which case the loop never runs
+        int lastValidIndex = size - pattern.length();
+        for (int index = offset; index <= lastValidIndex; index++) {
+            // cheap first byte test before the full comparison
+            if (getByteUnchecked(index) == firstByte && equalsUnchecked(index, pattern, 0, pattern.length())) {
+                return index;
+            }
+        }
+
+        return -1;
+    }
+
/**
* Compares the content of the specified buffer to the content of this
* buffer. This comparison is performed byte by byte using an unsigned
@@ -724,11 +887,11 @@ public final class Slice
checkIndexLength(offset, length);
that.checkIndexLength(otherOffset, otherLength);
- int compareLength = Math.min(length, otherLength);
+ int compareLength = min(length, otherLength);
while (compareLength >= SIZE_OF_LONG) {
- long thisLong = unsafe.getLong(base, address + offset);
+ long thisLong = getLongUnchecked(offset);
thisLong = Long.reverseBytes(thisLong);
- long thatLong = unsafe.getLong(that.base, that.address + otherOffset);
+ long thatLong = that.getLongUnchecked(otherOffset);
thatLong = Long.reverseBytes(thatLong);
int v = compareUnsignedLongs(thisLong, thatLong);
@@ -742,8 +905,8 @@ public final class Slice
}
while (compareLength > 0) {
- byte thisByte = unsafe.getByte(base, address + offset);
- byte thatByte = unsafe.getByte(that.base, that.address + otherOffset);
+ byte thisByte = getByteUnchecked(offset);
+ byte thatByte = that.getByteUnchecked(otherOffset);
int v = compareUnsignedBytes(thisByte, thatByte);
if (v != 0) {
@@ -779,8 +942,8 @@ public final class Slice
int offset = 0;
int length = size;
while (length >= SIZE_OF_LONG) {
- long thisLong = unsafe.getLong(base, address + offset);
- long thatLong = unsafe.getLong(that.base, that.address + offset);
+ long thisLong = getLongUnchecked(offset);
+ long thatLong = that.getLongUnchecked(offset);
if (thisLong != thatLong) {
return false;
@@ -791,8 +954,8 @@ public final class Slice
}
while (length > 0) {
- byte thisByte = unsafe.getByte(base, address + offset);
- byte thatByte = unsafe.getByte(that.base, that.address + offset);
+ byte thisByte = getByteUnchecked(offset);
+ byte thatByte = that.getByteUnchecked(offset);
if (thisByte != thatByte) {
return false;
}
@@ -838,16 +1001,21 @@ public final class Slice
return false;
}
+ checkIndexLength(offset, length);
+ that.checkIndexLength(otherOffset, otherLength);
+
+ return equalsUnchecked(offset, that, otherOffset, length);
+ }
+
+ boolean equalsUnchecked(int offset, Slice that, int otherOffset, int length)
+ {
if ((this == that) && (offset == otherOffset)) {
return true;
}
- checkIndexLength(offset, length);
- that.checkIndexLength(otherOffset, otherLength);
-
while (length >= SIZE_OF_LONG) {
- long thisLong = unsafe.getLong(base, address + offset);
- long thatLong = unsafe.getLong(that.base, that.address + otherOffset);
+ long thisLong = getLongUnchecked(offset);
+ long thatLong = that.getLongUnchecked(otherOffset);
if (thisLong != thatLong) {
return false;
@@ -859,8 +1027,8 @@ public final class Slice
}
while (length > 0) {
- byte thisByte = unsafe.getByte(base, address + offset);
- byte thatByte = unsafe.getByte(that.base, that.address + otherOffset);
+ byte thisByte = getByteUnchecked(offset);
+ byte thatByte = that.getByteUnchecked(otherOffset);
if (thisByte != thatByte) {
return false;
}
@@ -932,7 +1100,7 @@ public final class Slice
char[] chars = new char[length];
for (int pos = index; pos < length; pos++) {
- chars[pos] = (char) (unsafe.getByte(base, address + pos) & 0x7F);
+ chars[pos] = (char) (getByteUnchecked(pos) & 0x7F);
}
return new String(chars);
}
@@ -1029,13 +1197,13 @@ public final class Slice
private static long fillLong(byte value)
{
return (value & 0xFFL) << 56
- | (value & 0xFFL) << 48
- | (value & 0xFFL) << 40
- | (value & 0xFFL) << 32
- | (value & 0xFFL) << 24
- | (value & 0xFFL) << 16
- | (value & 0xFFL) << 8
- | (value & 0xFFL);
+ | (value & 0xFFL) << 48
+ | (value & 0xFFL) << 40
+ | (value & 0xFFL) << 32
+ | (value & 0xFFL) << 24
+ | (value & 0xFFL) << 16
+ | (value & 0xFFL) << 8
+ | (value & 0xFFL);
}
private static int compareUnsignedBytes(byte thisByte, byte thatByte)
=====================================
src/main/java/io/airlift/slice/SliceUtf8.java
=====================================
@@ -0,0 +1,942 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.airlift.slice;
+
+import java.util.OptionalInt;
+
+import static io.airlift.slice.Preconditions.checkArgument;
+import static io.airlift.slice.Preconditions.checkPositionIndex;
+import static io.airlift.slice.Preconditions.checkPositionIndexes;
+import static java.lang.Character.MAX_CODE_POINT;
+import static java.lang.Character.MAX_SURROGATE;
+import static java.lang.Character.MIN_SURROGATE;
+import static java.lang.Integer.toHexString;
+
+/**
+ * Utility methods for UTF-8 encoded slices.
+ */
+public final class SliceUtf8
+{
+ // static utility class, not instantiable
+ private SliceUtf8() {}
+
+ // U+FFFD; used as the case mapping of surrogates and as the default
+ // replacement in fixInvalidUtf8(Slice)
+ private static final int REPLACEMENT_CODE_POINT = 0xFFFD;
+
+ // masks selecting the top bit of every byte in an int / a long
+ private static final int TOP_MASK32 = 0x8080_8080;
+ private static final long TOP_MASK64 = 0x8080_8080_8080_8080L;
+
+ // Lookup tables indexed by code point, one entry for every value from
+ // 0 through MAX_CODE_POINT, filled once by the static initializer below
+ private static final int[] LOWER_CODE_POINTS;
+ private static final int[] UPPER_CODE_POINTS;
+ private static final boolean[] WHITESPACE_CODE_POINTS;
+
+ static {
+ LOWER_CODE_POINTS = new int[MAX_CODE_POINT + 1];
+ UPPER_CODE_POINTS = new int[MAX_CODE_POINT + 1];
+ WHITESPACE_CODE_POINTS = new boolean[MAX_CODE_POINT + 1];
+ for (int codePoint = 0; codePoint <= MAX_CODE_POINT; codePoint++) {
+ int type = Character.getType(codePoint);
+ if (type != Character.SURROGATE) {
+ // precompute the java.lang.Character answers for this code point
+ LOWER_CODE_POINTS[codePoint] = Character.toLowerCase(codePoint);
+ UPPER_CODE_POINTS[codePoint] = Character.toUpperCase(codePoint);
+ WHITESPACE_CODE_POINTS[codePoint] = Character.isWhitespace(codePoint);
+ }
+ else {
+ // surrogates map to the replacement code point and are never whitespace
+ LOWER_CODE_POINTS[codePoint] = REPLACEMENT_CODE_POINT;
+ UPPER_CODE_POINTS[codePoint] = REPLACEMENT_CODE_POINT;
+ WHITESPACE_CODE_POINTS[codePoint] = false;
+ }
+ }
+ }
+
+    /**
+     * Does the slice contain only 7-bit ASCII characters.
+     * <p>
+     * The slice is scanned eight bytes at a time, then at most once four
+     * bytes at a time, then byte by byte; a byte is non-ASCII when its top
+     * bit is set.
+     *
+     * @return {@code true} if no byte of the slice has its top bit set
+     *         (which includes the empty slice)
+     */
+    public static boolean isAscii(Slice utf8)
+    {
+        int length = utf8.length();
+        int offset = 0;
+
+        // Length rounded down to a multiple of 8 bytes
+        int length8 = length & 0x7FFF_FFF8;
+        for (; offset < length8; offset += 8) {
+            if ((utf8.getLongUnchecked(offset) & TOP_MASK64) != 0) {
+                return false;
+            }
+        }
+        // At most 7 bytes remain. Test four of them at once whenever at least
+        // four are left (<=, so a tail of exactly four bytes qualifies too)
+        if (offset + 4 <= length) {
+            if ((utf8.getIntUnchecked(offset) & TOP_MASK32) != 0) {
+                return false;
+            }
+
+            offset += 4;
+        }
+        // Do the rest one by one
+        for (; offset < length; offset++) {
+            if ((utf8.getByteUnchecked(offset) & 0x80) != 0) {
+                return false;
+            }
+        }
+
+        return true;
+    }
+
+ /**
+ * Counts the code points within UTF-8 encoded slice. This is
+ * {@link #countCodePoints(Slice, int, int)} applied to the whole slice.
+ * <p>
+ * Note: This method does not explicitly check for valid UTF-8, and may
+ * return incorrect results or throw an exception for invalid UTF-8.
+ *
+ * @param utf8 the UTF-8 encoded data
+ * @return the number of code points
+ */
+ public static int countCodePoints(Slice utf8)
+ {
+ return countCodePoints(utf8, 0, utf8.length());
+ }
+
+    /**
+     * Counts the code points in the {@code length} bytes of the UTF-8 encoded
+     * slice that start at byte index {@code offset}.
+     * <p>
+     * The count is the number of bytes in the range that are not
+     * continuation bytes.
+     * <p>
+     * Note: This method does not explicitly check for valid UTF-8, and may
+     * return incorrect results or throw an exception for invalid UTF-8.
+     *
+     * @param utf8 the UTF-8 encoded data
+     * @param offset byte index of the first byte to examine
+     * @param length number of bytes to examine
+     * @return the number of code points that start inside the range
+     */
+    public static int countCodePoints(Slice utf8, int offset, int length)
+    {
+        checkPositionIndexes(offset, offset + length, utf8.length());
+
+        // Quick exit if empty string
+        if (length == 0) {
+            return 0;
+        }
+
+        // Every loop below is bounded by an index relative to offset; using
+        // the bare length as the bound is only correct when offset is zero
+        int end = offset + length;
+        int position = offset;
+
+        int continuationBytesCount = 0;
+        // End of the part of the range that can be read 8 bytes at a time
+        int end8 = offset + (length & 0x7FFF_FFF8);
+        for (; position < end8; position += 8) {
+            // Count bytes which are NOT the start of a code point
+            continuationBytesCount += countContinuationBytes(utf8.getLongUnchecked(position));
+        }
+        // At least four bytes left? Then count four of them at once
+        if (position + 4 <= end) {
+            // Count bytes which are NOT the start of a code point
+            continuationBytesCount += countContinuationBytes(utf8.getIntUnchecked(position));
+
+            position += 4;
+        }
+        // Do the rest one by one
+        for (; position < end; position++) {
+            // Count bytes which are NOT the start of a code point
+            continuationBytesCount += countContinuationBytes(utf8.getByteUnchecked(position));
+        }
+
+        assert continuationBytesCount <= length;
+        return length - continuationBytesCount;
+    }
+
+    /**
+     * Returns the part of the slice that begins at code point number
+     * {@code codePointStart} and spans {@code codePointLength} code points.
+     * <p>
+     * Note: This method does not explicitly check for valid UTF-8, and may
+     * return incorrect results or throw an exception for invalid UTF-8.
+     *
+     * @throws IllegalArgumentException if an argument is negative or the
+     *         slice does not hold enough code points
+     * @throws InvalidUtf8Exception if the last code point would end beyond
+     *         the end of the slice
+     */
+    public static Slice substring(Slice utf8, int codePointStart, int codePointLength)
+    {
+        checkArgument(codePointStart >= 0, "codePointStart is negative");
+        checkArgument(codePointLength >= 0, "codePointLength is negative");
+
+        // byte index of the first requested code point
+        int firstByte = offsetOfCodePoint(utf8, codePointStart);
+        if (firstByte < 0) {
+            throw new IllegalArgumentException("UTF-8 does not contain " + codePointStart + " code points");
+        }
+
+        if (codePointLength == 0) {
+            return Slices.EMPTY_SLICE;
+        }
+
+        // byte index of the last requested code point
+        int lastCodePointByte = offsetOfCodePoint(utf8, firstByte, codePointLength - 1);
+        if (lastCodePointByte < 0) {
+            throw new IllegalArgumentException("UTF-8 does not contain " + (codePointStart + codePointLength) + " code points");
+        }
+
+        // step over the last code point to get the exclusive end index
+        int endByte = lastCodePointByte + lengthOfCodePoint(utf8, lastCodePointByte);
+        if (endByte > utf8.length()) {
+            throw new InvalidUtf8Exception("UTF-8 is not well formed");
+        }
+
+        return utf8.slice(firstByte, endByte - firstByte);
+    }
+
+    /**
+     * Returns a new slice holding the code points of {@code utf8} in
+     * reverse order. The bytes inside each code point keep their order.
+     * <p>
+     * Note: Invalid UTF-8 sequences are copied directly to the output.
+     */
+    public static Slice reverse(Slice utf8)
+    {
+        int size = utf8.length();
+        Slice reversed = Slices.allocate(size);
+
+        // read front to back, write back to front
+        int writeIndex = size;
+        for (int readIndex = 0; readIndex < size; ) {
+            int sequenceLength = lengthOfCodePointSafe(utf8, readIndex);
+
+            // move the write index back to where this sequence has to start
+            writeIndex -= sequenceLength;
+            if (writeIndex < 0) {
+                // this should not happen
+                throw new InvalidUtf8Exception("UTF-8 is not well formed");
+            }
+
+            copyUtf8SequenceUnsafe(utf8, readIndex, reversed, writeIndex, sequenceLength);
+            readIndex += sequenceLength;
+        }
+        return reversed;
+    }
+
+ /**
+ * Converts slice to upper case code point by code point. This method does
+ * not perform locale-sensitive, context-sensitive, or one-to-many
+ * mappings required for some languages. Specifically, this will return
+ * incorrect results for Lithuanian, Turkish, and Azeri.
+ * <p>
+ * Note: Invalid UTF-8 sequences are copied directly to the output.
+ *
+ * @param utf8 the UTF-8 encoded data
+ * @return a slice with every code point replaced by its upper case mapping
+ */
+ public static Slice toUpperCase(Slice utf8)
+ {
+ return translateCodePoints(utf8, UPPER_CODE_POINTS);
+ }
+
+ /**
+ * Converts slice to lower case code point by code point. This method does
+ * not perform locale-sensitive, context-sensitive, or one-to-many
+ * mappings required for some languages. Specifically, this will return
+ * incorrect results for Lithuanian, Turkish, and Azeri.
+ * <p>
+ * Note: Invalid UTF-8 sequences are copied directly to the output.
+ *
+ * @param utf8 the UTF-8 encoded data
+ * @return a slice with every code point replaced by its lower case mapping
+ */
+ public static Slice toLowerCase(Slice utf8)
+ {
+ return translateCodePoints(utf8, LOWER_CODE_POINTS);
+ }
+
+    /**
+     * Builds a new slice in which every valid code point of {@code utf8} is
+     * replaced by its entry in {@code codePointTranslationMap}. Invalid
+     * sequences are copied to the output unchanged.
+     */
+    private static Slice translateCodePoints(Slice utf8, int[] codePointTranslationMap)
+    {
+        int inputLength = utf8.length();
+        Slice output = Slices.allocate(inputLength);
+
+        int inputPosition = 0;
+        int outputPosition = 0;
+        while (inputPosition < inputLength) {
+            // negative result: invalid sequence of -decoded bytes
+            int decoded = tryGetCodePointAt(utf8, inputPosition);
+            boolean valid = decoded >= 0;
+
+            int translated = valid ? codePointTranslationMap[decoded] : -1;
+            int bytesToWrite = valid ? lengthOfCodePoint(translated) : -decoded;
+
+            // a mapping may need more bytes than its source, so the output can outgrow the input
+            int nextOutputPosition = outputPosition + bytesToWrite;
+            if (nextOutputPosition > inputLength) {
+                output = Slices.ensureSize(output, nextOutputPosition);
+            }
+
+            if (valid) {
+                setCodePointAt(translated, output, outputPosition);
+                inputPosition += lengthOfCodePoint(decoded);
+            }
+            else {
+                copyUtf8SequenceUnsafe(utf8, inputPosition, output, outputPosition, bytesToWrite);
+                inputPosition += bytesToWrite;
+            }
+            outputPosition = nextOutputPosition;
+        }
+        return output.slice(0, outputPosition);
+    }
+
+ /**
+ * Copies {@code length} bytes (1 to 6) from {@code source} at
+ * {@code sourcePosition} to {@code destination} at
+ * {@code destinationPosition}, using one or two int/short/byte moves.
+ * <p>
+ * Only the unchecked accessors are used, so no bounds check happens
+ * here; callers have to make sure both ranges lie inside their slices.
+ *
+ * @throws IllegalStateException if {@code length} is not between 1 and 6
+ */
+ private static void copyUtf8SequenceUnsafe(Slice source, int sourcePosition, Slice destination, int destinationPosition, int length)
+ {
+ switch (length) {
+ case 1:
+ destination.setByteUnchecked(destinationPosition, source.getByteUnchecked(sourcePosition));
+ break;
+ case 2:
+ destination.setShortUnchecked(destinationPosition, source.getShortUnchecked(sourcePosition));
+ break;
+ case 3:
+ // 2 + 1 bytes
+ destination.setShortUnchecked(destinationPosition, source.getShortUnchecked(sourcePosition));
+ destination.setByteUnchecked(destinationPosition + 2, source.getByteUnchecked(sourcePosition + 2));
+ break;
+ case 4:
+ destination.setIntUnchecked(destinationPosition, source.getIntUnchecked(sourcePosition));
+ break;
+ case 5:
+ // 4 + 1 bytes
+ destination.setIntUnchecked(destinationPosition, source.getIntUnchecked(sourcePosition));
+ destination.setByteUnchecked(destinationPosition + 4, source.getByteUnchecked(sourcePosition + 4));
+ break;
+ case 6:
+ // 4 + 2 bytes
+ destination.setIntUnchecked(destinationPosition, source.getIntUnchecked(sourcePosition));
+ destination.setShortUnchecked(destinationPosition + 4, source.getShortUnchecked(sourcePosition + 4));
+ break;
+ default:
+ throw new IllegalStateException("Invalid code point length " + length);
+ }
+ }
+
+    /**
+     * Removes all white space characters from the left side of the string.
+     * <p>
+     * Note: Invalid UTF-8 sequences are not trimmed.
+     */
+    public static Slice leftTrim(Slice utf8)
+    {
+        int start = firstNonWhitespacePosition(utf8);
+        return utf8.slice(start, utf8.length() - start);
+    }
+
+    /**
+     * Returns the byte index of the first code point that is not white
+     * space. An invalid UTF-8 sequence also stops the scan. If the whole
+     * slice is white space, the length of the slice is returned.
+     */
+    private static int firstNonWhitespacePosition(Slice utf8)
+    {
+        int end = utf8.length();
+
+        int index = 0;
+        while (index < end) {
+            int codePoint = tryGetCodePointAt(utf8, index);
+            // stop at an invalid sequence (negative) or at the first non white space code point
+            if (codePoint < 0 || !WHITESPACE_CODE_POINTS[codePoint]) {
+                return index;
+            }
+            index += lengthOfCodePoint(codePoint);
+        }
+        return index;
+    }
+
+    /**
+     * Removes all white space characters from the right side of the string.
+     * <p>
+     * Note: Invalid UTF-8 sequences are not trimmed.
+     */
+    public static Slice rightTrim(Slice utf8)
+    {
+        // the whole slice may be trimmed away, so the lower bound is 0
+        int end = lastNonWhitespacePosition(utf8, 0);
+        return utf8.slice(0, end);
+    }
+
+    /**
+     * Scans backwards from the end of the slice and returns the byte index
+     * just past the last code point that is not white space. The scan never
+     * goes below {@code minPosition} and stops at an invalid UTF-8 sequence.
+     */
+    private static int lastNonWhitespacePosition(Slice utf8, int minPosition)
+    {
+        int end = utf8.length();
+        while (minPosition < end) {
+            // decode the code point that ends right before 'end', if possible
+            int codePoint;
+            byte lastByte = utf8.getByte(end - 1);
+            if (!isContinuationByte(lastByte)) {
+                codePoint = lastByte & 0xFF;
+            }
+            else {
+                // look 2, then 3, then 4 bytes back for the byte that starts
+                // the sequence, without crossing minPosition
+                int start = -1;
+                for (int back = 2; back <= 4 && minPosition <= end - back; back++) {
+                    if (!isContinuationByte(utf8.getByte(end - back))) {
+                        start = end - back;
+                        break;
+                    }
+                }
+                if (start < 0) {
+                    // no start byte in reach
+                    break;
+                }
+                codePoint = tryGetCodePointAt(utf8, start);
+            }
+
+            if (codePoint < 0 || !WHITESPACE_CODE_POINTS[codePoint]) {
+                break;
+            }
+            end -= lengthOfCodePoint(codePoint);
+        }
+        return end;
+    }
+
+    /**
+     * Removes all white space characters from both ends of the string.
+     * <p>
+     * Note: Invalid UTF-8 sequences are not trimmed.
+     */
+    public static Slice trim(Slice utf8)
+    {
+        int from = firstNonWhitespacePosition(utf8);
+        // the backwards scan must not run into the part already trimmed on the left
+        int to = lastNonWhitespacePosition(utf8, from);
+        return utf8.slice(from, to - from);
+    }
+
+ /**
+ * Replaces every invalid UTF-8 sequence in the slice with U+FFFD. This is
+ * {@link #fixInvalidUtf8(Slice, OptionalInt)} with the replacement code
+ * point set to {@code 0xFFFD}.
+ */
+ public static Slice fixInvalidUtf8(Slice slice)
+ {
+ return fixInvalidUtf8(slice, OptionalInt.of(REPLACEMENT_CODE_POINT));
+ }
+
+    /**
+     * Returns UTF-8 in which every invalid sequence of {@code slice} has been
+     * replaced by {@code replacementCodePoint}, or removed if no replacement
+     * is given. A slice that is pure ASCII is returned as is.
+     */
+    public static Slice fixInvalidUtf8(Slice slice, OptionalInt replacementCodePoint)
+    {
+        if (isAscii(slice)) {
+            return slice;
+        }
+
+        // -1 means: drop invalid sequences instead of replacing them
+        int replacement = -1;
+        int replacementLength = 0;
+        if (replacementCodePoint.isPresent()) {
+            replacement = replacementCodePoint.getAsInt();
+            replacementLength = lengthOfCodePoint(replacement);
+        }
+
+        int inputLength = slice.length();
+        Slice output = Slices.allocate(inputLength);
+
+        int readIndex = 0;
+        int writeIndex = 0;
+        while (readIndex < inputLength) {
+            int decoded = tryGetCodePointAt(slice, readIndex);
+
+            int codePointToWrite;
+            int bytesToWrite;
+            if (decoded < 0) {
+                // the negated result is the number of invalid bytes to skip
+                readIndex += (-decoded);
+                if (replacement < 0) {
+                    continue;
+                }
+                codePointToWrite = replacement;
+                bytesToWrite = replacementLength;
+            }
+            else {
+                bytesToWrite = lengthOfCodePoint(decoded);
+                readIndex += bytesToWrite;
+                codePointToWrite = decoded;
+            }
+
+            output = Slices.ensureSize(output, writeIndex + bytesToWrite);
+            writeIndex += setCodePointAt(codePointToWrite, output, writeIndex);
+        }
+        return output.slice(0, writeIndex);
+    }
+
+    /**
+     * Tries to get the UTF-8 encoded code point at the {@code position}. A non-negative
+     * return value means the UTF-8 sequence at the position is valid, and the result
+     * is the code point. A negative return value means the UTF-8 sequence at the
+     * position is invalid, and the length of the invalid sequence is the absolute
+     * value of the result.
+     * <p>
+     * Reported as invalid: a continuation byte in start position, a sequence that is
+     * cut short or lacks a continuation byte, an encoded surrogate, an overlong
+     * encoding (a code point stored in more bytes than it needs), a value above
+     * U+10FFFF, and the five and six byte forms that RFC 3629 does not allow.
+     *
+     * @param utf8 the UTF-8 encoded data
+     * @param position byte index of the first byte of the sequence
+     * @return the code point or negative the number of bytes in the invalid UTF-8 sequence.
+     */
+    public static int tryGetCodePointAt(Slice utf8, int position)
+    {
+        //
+        // Process first byte
+        byte firstByte = utf8.getByte(position);
+
+        int length = lengthOfCodePointFromStartByteSafe(firstByte);
+        if (length < 0) {
+            return length;
+        }
+
+        if (length == 1) {
+            // normal ASCII
+            // 0xxx_xxxx
+            return firstByte;
+        }
+
+        //
+        // Process second byte
+        if (position + 1 >= utf8.length()) {
+            return -1;
+        }
+
+        byte secondByte = utf8.getByteUnchecked(position + 1);
+        if (!isContinuationByte(secondByte)) {
+            return -1;
+        }
+
+        if (length == 2) {
+            // 110x_xxxx 10xx_xxxx
+            int codePoint = ((firstByte & 0b0001_1111) << 6) |
+                    (secondByte & 0b0011_1111);
+            // Values below 0x80 fit in one byte, so this is an overlong encoding.
+            // Accepting it would make callers advance by the canonical length (1)
+            // and land in the middle of the sequence.
+            if (codePoint < 0x80) {
+                return -2;
+            }
+            return codePoint;
+        }
+
+        //
+        // Process third byte
+        if (position + 2 >= utf8.length()) {
+            return -2;
+        }
+
+        byte thirdByte = utf8.getByteUnchecked(position + 2);
+        if (!isContinuationByte(thirdByte)) {
+            return -2;
+        }
+
+        if (length == 3) {
+            // 1110_xxxx 10xx_xxxx 10xx_xxxx
+            int codePoint = ((firstByte & 0b0000_1111) << 12) |
+                    ((secondByte & 0b0011_1111) << 6) |
+                    (thirdByte & 0b0011_1111);
+
+            // surrogates are invalid
+            if (MIN_SURROGATE <= codePoint && codePoint <= MAX_SURROGATE) {
+                return -3;
+            }
+            // values below 0x800 fit in two bytes: overlong encoding
+            if (codePoint < 0x800) {
+                return -3;
+            }
+            return codePoint;
+        }
+
+        //
+        // Process fourth byte
+        if (position + 3 >= utf8.length()) {
+            return -3;
+        }
+
+        byte fourthByte = utf8.getByteUnchecked(position + 3);
+        if (!isContinuationByte(fourthByte)) {
+            return -3;
+        }
+
+        if (length == 4) {
+            // 1111_0xxx 10xx_xxxx 10xx_xxxx 10xx_xxxx
+            int codePoint = ((firstByte & 0b0000_0111) << 18) |
+                    ((secondByte & 0b0011_1111) << 12) |
+                    ((thirdByte & 0b0011_1111) << 6) |
+                    (fourthByte & 0b0011_1111);
+            // 4 byte code points have a limited valid range: values below
+            // 0x1_0000 are overlong, values from 0x11_0000 up are not code points
+            if (0x1_0000 <= codePoint && codePoint < 0x11_0000) {
+                return codePoint;
+            }
+            return -4;
+        }
+
+        //
+        // Process fifth byte
+        if (position + 4 >= utf8.length()) {
+            return -4;
+        }
+
+        byte fifthByte = utf8.getByteUnchecked(position + 4);
+        if (!isContinuationByte(fifthByte)) {
+            return -4;
+        }
+
+        if (length == 5) {
+            // Per RFC3629, UTF-8 is limited to 4 bytes, so more bytes are illegal
+            return -5;
+        }
+
+        //
+        // Process sixth byte
+        if (position + 5 >= utf8.length()) {
+            return -5;
+        }
+
+        byte sixthByte = utf8.getByteUnchecked(position + 5);
+        if (!isContinuationByte(sixthByte)) {
+            return -5;
+        }
+
+        if (length == 6) {
+            // Per RFC3629, UTF-8 is limited to 4 bytes, so more bytes are illegal
+            return -6;
+        }
+
+        // for longer sequence, which can't happen
+        return -1;
+    }
+
+    /**
+     * Determines the sequence length announced by a UTF-8 start byte without throwing.
+     *
+     * @return 1 for ASCII, 2 to 6 for a lead byte announcing that many bytes, or
+     * {@code -1} for a continuation byte (10xx_xxxx) and for 0xFE / 0xFF
+     */
+    static int lengthOfCodePointFromStartByteSafe(byte startByte)
+    {
+        int lead = startByte & 0xFF;
+        if ((lead & 0b1000_0000) == 0) {
+            // 0xxx_xxxx: plain ASCII
+            return 1;
+        }
+
+        // Count the run of one bits at the top of the byte: move the inverted byte
+        // into the top of an int so the run becomes a run of leading zeros.
+        int leadingOnes = Integer.numberOfLeadingZeros(~lead << 24);
+
+        // 1 leading one  -> 10xx_xxxx, a continuation byte, not a start byte
+        // 2..6 leading ones -> lead byte of a sequence of that many bytes
+        // 7 or 8 leading ones -> 0xFE / 0xFF, never legal
+        boolean isLeadByte = leadingOnes >= 2 && leadingOnes <= 6;
+        return isLeadByte ? leadingOnes : -1;
+    }
+
+ /**
+ * Finds the byte index of the first byte of the code point that is
+ * {@code codePointCount} code points from the start of the slice, or
+ * {@code -1} if that code point is not within the slice.
+ * <p>
+ * Equivalent to {@code offsetOfCodePoint(utf8, 0, codePointCount)}.
+ * <p>
+ * Note: This method does not explicitly check for valid UTF-8, and may
+ * return incorrect results or throw an exception for invalid UTF-8.
+ */
+ public static int offsetOfCodePoint(Slice utf8, int codePointCount)
+ {
+ return offsetOfCodePoint(utf8, 0, codePointCount);
+ }
+
+ /**
+ * Starting from {@code position} bytes in {@code utf8}, finds the
+ * index of the first byte of the code point {@code codePointCount}
+ * in the slice. If the slice does not contain
+ * {@code codePointCount} code points after {@code position}, {@code -1}
+ * is returned.
+ * <p>
+ * The search starts by assuming every code point is one byte long
+ * (target index = {@code position + codePointCount}) and then pushes the
+ * target index forward by one for every continuation byte (10xx_xxxx)
+ * that is scanned, 8, 4 or 1 byte(s) at a time.
+ * <p>
+ * Note: This method does not explicitly check for valid UTF-8, and may
+ * return incorrect results or throw an exception for invalid UTF-8.
+ *
+ * @param utf8 the slice to scan
+ * @param position byte index at which counting starts
+ * @param codePointCount number of code points to skip; must not be negative
+ * @return the byte index of the requested code point, {@code position} itself
+ * when {@code codePointCount} is zero and bytes remain, or {@code -1}
+ */
+ public static int offsetOfCodePoint(Slice utf8, int position, int codePointCount)
+ {
+ checkPositionIndex(position, utf8.length());
+ checkArgument(codePointCount >= 0, "codePointPosition is negative");
+
+ // Quick exit if we are sure that the position is after the end
+ // (every code point needs at least one byte)
+ if (utf8.length() - position <= codePointCount) {
+ return -1;
+ }
+ if (codePointCount == 0) {
+ return position;
+ }
+
+ // Index the answer would have if all skipped code points were single bytes;
+ // it is moved forward by one for each continuation byte encountered
+ int correctIndex = codePointCount + position;
+ // Length rounded to 8 bytes
+ int length8 = utf8.length() & 0x7FFF_FFF8;
+ // While we have enough bytes left and we need at least 8 characters process 8 bytes at once
+ while (position < length8 && correctIndex >= position + 8) {
+ // Count bytes which are NOT the start of a code point
+ correctIndex += countContinuationBytes(utf8.getLongUnchecked(position));
+
+ position += 8;
+ }
+ // Length rounded to 4 bytes
+ int length4 = utf8.length() & 0x7FFF_FFFC;
+ // While we have enough bytes left and we need at least 4 characters process 4 bytes at once
+ while (position < length4 && correctIndex >= position + 4) {
+ // Count bytes which are NOT the start of a code point
+ correctIndex += countContinuationBytes(utf8.getIntUnchecked(position));
+
+ position += 4;
+ }
+ // Do the rest one by one, always check the last byte to find the end of the code point
+ while (position < utf8.length()) {
+ // Count bytes which are NOT the start of a code point
+ correctIndex += countContinuationBytes(utf8.getByteUnchecked(position));
+ // The scan caught up with the target: this byte starts the requested code point
+ if (position == correctIndex) {
+ break;
+ }
+
+ position++;
+ }
+
+ // Only report an index that was actually reached and lies inside the slice
+ if (position == correctIndex && correctIndex < utf8.length()) {
+ return correctIndex;
+ }
+ return -1;
+ }
+
+ /**
+ * Gets the UTF-8 sequence length of the code point at {@code position}.
+ * <p>
+ * Only the byte at {@code position} is inspected; the length is derived
+ * from it via {@code lengthOfCodePointFromStartByte}.
+ * <p>
+ * Note: This method does not explicitly check for valid UTF-8, and may
+ * return incorrect results or throw an exception for invalid UTF-8.
+ *
+ * @param utf8 the slice to read from
+ * @param position byte index of the first byte of the code point
+ * @return the sequence length announced by the start byte
+ */
+ public static int lengthOfCodePoint(Slice utf8, int position)
+ {
+ return lengthOfCodePointFromStartByte(utf8.getByte(position));
+ }
+
+    /**
+     * Gets the UTF-8 sequence length of the code point at {@code position}.
+     * <p>
+     * Truncated UTF-8 sequences, 5 and 6 byte sequences, and invalid code points
+     * are handled by this method without throwing an exception. The result is the
+     * number of bytes that belong together at {@code position}: the start byte plus
+     * every continuation byte that is actually present, capped by the length the
+     * start byte announces. An illegal start byte counts as a sequence of length 1.
+     */
+    public static int lengthOfCodePointSafe(Slice utf8, int position)
+    {
+        int announced = lengthOfCodePointFromStartByteSafe(utf8.getByte(position));
+        if (announced < 0) {
+            // illegal start byte: the negative value carries the bytes to skip
+            return -announced;
+        }
+
+        // Extend over continuation bytes until the announced length is reached,
+        // the slice ends, or a non-continuation byte shows up.
+        int present = 1;
+        while (present < announced
+                && position + present < utf8.length()
+                && isContinuationByte(utf8.getByteUnchecked(position + present))) {
+            present++;
+        }
+        return present;
+    }
+
+ /**
+ * Gets the UTF-8 sequence length of the code point.
+ *
+ * @throws InvalidCodePointException if code point is not within a valid range
+ */
+ public static int lengthOfCodePoint(int codePoint)
+ {
+ if (codePoint < 0) {
+ throw new InvalidCodePointException(codePoint);
+ }
+ if (codePoint < 0x80) {
+ // normal ASCII
+ // 0xxx_xxxx
+ return 1;
+ }
+ if (codePoint < 0x800) {
+ return 2;
+ }
+ if (codePoint < 0x1_0000) {
+ return 3;
+ }
+ if (codePoint < 0x11_0000) {
+ return 4;
+ }
+ // Per RFC3629, UTF-8 is limited to 4 bytes, so more bytes are illegal
+ throw new InvalidCodePointException(codePoint);
+ }
+
+    /**
+     * Gets the UTF-8 sequence length using the sequence start byte.
+     * <p>
+     * Note: This method does not explicitly check for valid UTF-8, and may
+     * return incorrect results or throw an exception for invalid UTF-8.
+     *
+     * @throws InvalidUtf8Exception if the byte is a continuation byte (10xx_xxxx)
+     * or announces a sequence longer than 4 bytes
+     */
+    public static int lengthOfCodePointFromStartByte(byte startByte)
+    {
+        int lead = startByte & 0xFF;
+
+        // 10xx_xxxx can never start a sequence, and per RFC3629 UTF-8 is limited
+        // to 4 bytes, so lead bytes from 0xF8 upwards are illegal as well
+        boolean continuation = (lead & 0xc0) == 0x80;
+        boolean tooLong = lead >= 0xf8;
+        if (continuation || tooLong) {
+            throw new InvalidUtf8Exception("Illegal start 0x" + toHexString(lead).toUpperCase() + " of code point");
+        }
+
+        if (lead < 0x80) {
+            // 0xxx_xxxx
+            return 1;
+        }
+        if (lead < 0xe0) {
+            // 110x_xxxx 10xx_xxxx
+            return 2;
+        }
+        // 1110_xxxx 10xx_xxxx 10xx_xxxx
+        // 1111_0xxx 10xx_xxxx 10xx_xxxx 10xx_xxxx
+        return lead < 0xf0 ? 3 : 4;
+    }
+
+ /**
+ * Gets the UTF-8 encoded code point at the {@code position}.
+ * <p>
+ * Note: This method does not explicitly check for valid UTF-8, and may
+ * return incorrect results or throw an exception for invalid UTF-8.
+ */
+ public static int getCodePointAt(Slice utf8, int position)
+ {
+ int unsignedStartByte = utf8.getByte(position) & 0xFF;
+ if (unsignedStartByte < 0x80) {
+ // normal ASCII
+ // 0xxx_xxxx
+ return unsignedStartByte;
+ }
+ if (unsignedStartByte < 0xc0) {
+ // illegal bytes
+ // 10xx_xxxx
+ throw new InvalidUtf8Exception("Illegal start 0x" + toHexString(unsignedStartByte).toUpperCase() + " of code point");
+ }
+ if (unsignedStartByte < 0xe0) {
+ // 110x_xxxx 10xx_xxxx
+ if (position + 1 >= utf8.length()) {
+ throw new InvalidUtf8Exception("UTF-8 sequence truncated");
+ }
+ return ((unsignedStartByte & 0b0001_1111) << 6) |
+ (utf8.getByte(position + 1) & 0b0011_1111);
+ }
+ if (unsignedStartByte < 0xf0) {
+ // 1110_xxxx 10xx_xxxx 10xx_xxxx
+ if (position + 2 >= utf8.length()) {
+ throw new InvalidUtf8Exception("UTF-8 sequence truncated");
+ }
+ return ((unsignedStartByte & 0b0000_1111) << 12) |
+ ((utf8.getByteUnchecked(position + 1) & 0b0011_1111) << 6) |
+ (utf8.getByteUnchecked(position + 2) & 0b0011_1111);
+ }
+ if (unsignedStartByte < 0xf8) {
+ // 1111_0xxx 10xx_xxxx 10xx_xxxx 10xx_xxxx
+ if (position + 3 >= utf8.length()) {
+ throw new InvalidUtf8Exception("UTF-8 sequence truncated");
+ }
+ return ((unsignedStartByte & 0b0000_0111) << 18) |
+ ((utf8.getByteUnchecked(position + 1) & 0b0011_1111) << 12) |
+ ((utf8.getByteUnchecked(position + 2) & 0b0011_1111) << 6) |
+ (utf8.getByteUnchecked(position + 3) & 0b0011_1111);
+ }
+ // Per RFC3629, UTF-8 is limited to 4 bytes, so more bytes are illegal
+ throw new InvalidUtf8Exception("Illegal start 0x" + toHexString(unsignedStartByte).toUpperCase() + " of code point");
+ }
+
+    /**
+     * Gets the UTF-8 encoded code point before the {@code position}.
+     * <p>
+     * Walks backwards over at most three continuation bytes to find the start
+     * byte of the sequence that ends right before {@code position}.
+     * <p>
+     * Note: This method does not explicitly check for valid UTF-8, and may
+     * return incorrect results or throw an exception for invalid UTF-8.
+     *
+     * @throws InvalidUtf8Exception if the four bytes before {@code position}
+     * are all continuation bytes
+     */
+    public static int getCodePointBefore(Slice utf8, int position)
+    {
+        byte previous = utf8.getByte(position - 1);
+        if (!isContinuationByte(previous)) {
+            // single byte sequence: the byte value is the code point
+            return previous & 0xFF;
+        }
+
+        // a multi-byte sequence starts 2, 3 or 4 bytes back
+        for (int distance = 2; distance <= 4; distance++) {
+            int candidate = position - distance;
+            if (!isContinuationByte(utf8.getByte(candidate))) {
+                return getCodePointAt(utf8, candidate);
+            }
+        }
+
+        // Per RFC3629, UTF-8 is limited to 4 bytes, so more bytes are illegal
+        throw new InvalidUtf8Exception("UTF-8 is not well formed");
+    }
+
+    /**
+     * Tells whether the byte has the form 10xx_xxxx, i.e. is a UTF-8 continuation byte.
+     */
+    private static boolean isContinuationByte(byte b)
+    {
+        int topTwoBits = b & 0xC0;
+        return topTwoBits == 0x80;
+    }
+
+ /**
+ * Convert the code point to UTF-8.
+ * <p>
+ * A new slice of exactly {@code lengthOfCodePoint(codePoint)} bytes is
+ * allocated and the encoded sequence is written at its start.
+ *
+ * @return a newly allocated slice holding the encoded code point
+ * @throws InvalidCodePointException if code point is not within a valid range
+ */
+ public static Slice codePointToUtf8(int codePoint)
+ {
+ Slice utf8 = Slices.allocate(lengthOfCodePoint(codePoint));
+ setCodePointAt(codePoint, utf8, 0);
+ return utf8;
+ }
+
+ /**
+ * Sets the UTF-8 sequence for code point at the {@code position}.
+ *
+ * @throws InvalidCodePointException if code point is not within a valid range
+ */
+ public static int setCodePointAt(int codePoint, Slice utf8, int position)
+ {
+ if (codePoint < 0) {
+ throw new InvalidCodePointException(codePoint);
+ }
+ if (codePoint < 0x80) {
+ // normal ASCII
+ // 0xxx_xxxx
+ utf8.setByte(position, codePoint);
+ return 1;
+ }
+ if (codePoint < 0x800) {
+ // 110x_xxxx 10xx_xxxx
+ utf8.setByte(position, 0b1100_0000 | (codePoint >>> 6));
+ utf8.setByte(position + 1, 0b1000_0000 | (codePoint & 0b0011_1111));
+ return 2;
+ }
+ if (MIN_SURROGATE <= codePoint && codePoint <= MAX_SURROGATE) {
+ throw new InvalidCodePointException(codePoint);
+ }
+ if (codePoint < 0x1_0000) {
+ // 1110_xxxx 10xx_xxxx 10xx_xxxx
+ utf8.setByte(position, 0b1110_0000 | ((codePoint >>> 12) & 0b0000_1111));
+ utf8.setByte(position + 1, 0b1000_0000 | ((codePoint >>> 6) & 0b0011_1111));
+ utf8.setByte(position + 2, 0b1000_0000 | (codePoint & 0b0011_1111));
+ return 3;
+ }
+ if (codePoint < 0x11_0000) {
+ // 1111_0xxx 10xx_xxxx 10xx_xxxx 10xx_xxxx
+ utf8.setByte(position, 0b1111_0000 | ((codePoint >>> 18) & 0b0000_0111));
+ utf8.setByte(position + 1, 0b1000_0000 | ((codePoint >>> 12) & 0b0011_1111));
+ utf8.setByte(position + 2, 0b1000_0000 | ((codePoint >>> 6) & 0b0011_1111));
+ utf8.setByte(position + 3, 0b1000_0000 | (codePoint & 0b0011_1111));
+ return 4;
+ }
+ // Per RFC3629, UTF-8 is limited to 4 bytes, so more bytes are illegal
+ throw new InvalidCodePointException(codePoint);
+ }
+
+ /**
+ * Returns 1 if the byte matches 0b10xx_xxxx (a UTF-8 continuation byte), otherwise 0.
+ */
+ private static int countContinuationBytes(byte i8)
+ {
+ // Same idea as the long variant, for a single byte:
+ // (value >>> 7) is the top bit; the lowest bit of (~value >>> 6) is the
+ // inverted second-highest bit; and-ing them yields 1 only for 10xx_xxxx.
+ int value = i8 & 0xff;
+ return (value >>> 7) & (~value >>> 6);
+ }
+
+ /**
+ * Counts how many of the four bytes packed in {@code i32} match 0b10xx_xxxx.
+ */
+ private static int countContinuationBytes(int i32)
+ {
+ // Same technique as the long variant: keep the masked top bit of each byte,
+ // shift it down one position, and-it with the inverted value, then count
+ // the surviving bits (one per matching byte).
+ i32 = ((i32 & TOP_MASK32) >>> 1) & (~i32);
+ return Integer.bitCount(i32);
+ }
+
+ /**
+ * Counts how many of the eight bytes packed in {@code i64} match 0b10xx_xxxx.
+ */
+ private static int countContinuationBytes(long i64)
+ {
+ // Count the number of bytes that match 0b10xx_xxxx as follows:
+ // 1. Mask off the 8th bit of every byte and shift it into the 7th position.
+ // 2. Then invert the bytes, which turns the 0 in the 7th bit to a one.
+ // 3. And together the results of step 1 and 2, giving us a one in the 7th
+ // position if the byte matched.
+ // 4. Count the number of bits in the result, which is the number of bytes
+ // that matched.
+ i64 = ((i64 & TOP_MASK64) >>> 1) & (~i64);
+ return Long.bitCount(i64);
+ }
+}
=====================================
src/main/java/io/airlift/slice/Slices.java
=====================================
@@ -59,7 +59,7 @@ public final class Slices
else {
newCapacity = existingSlice.length();
}
- int minNewCapacity = existingSlice.length() + minWritableBytes;
+ int minNewCapacity = minWritableBytes;
while (newCapacity < minNewCapacity) {
if (newCapacity < SLICE_ALLOC_THRESHOLD) {
newCapacity <<= 1;
@@ -69,7 +69,7 @@ public final class Slices
}
}
- Slice newSlice = Slices.allocate(newCapacity);
+ Slice newSlice = allocate(newCapacity);
newSlice.setBytes(0, existingSlice, 0, existingSlice.length());
return newSlice;
}
@@ -112,18 +112,18 @@ public final class Slices
{
if (buffer instanceof DirectBuffer) {
DirectBuffer direct = (DirectBuffer) buffer;
- return new Slice(null, direct.address() + buffer.position(), buffer.limit() - buffer.position(), direct);
+ return new Slice(null, direct.address() + buffer.position(), buffer.limit() - buffer.position(), buffer.capacity(), direct);
}
if (buffer.hasArray()) {
int address = ARRAY_BYTE_BASE_OFFSET + buffer.arrayOffset() + buffer.position();
- return new Slice(buffer.array(), address, buffer.limit() - buffer.position(), null);
+ return new Slice(buffer.array(), address, buffer.limit() - buffer.position(), buffer.array().length, null);
}
throw new IllegalArgumentException("cannot wrap " + buffer.getClass().getName());
}
- public static Slice wrappedBuffer(byte[] array)
+ public static Slice wrappedBuffer(byte... array)
{
if (array.length == 0) {
return EMPTY_SLICE;
=====================================
src/main/java/io/airlift/slice/UnsafeSliceFactory.java
=====================================
@@ -70,7 +70,7 @@ public class UnsafeSliceFactory
if (size == 0) {
return Slices.EMPTY_SLICE;
}
- return new Slice(null, address, size, null);
+ return new Slice(null, address, size, 0, null);
}
/**
@@ -95,6 +95,6 @@ public class UnsafeSliceFactory
if (size == 0) {
return Slices.EMPTY_SLICE;
}
- return new Slice(null, address, size, reference);
+ return new Slice(null, address, size, size, reference);
}
}
=====================================
src/main/java/io/airlift/slice/XxHash64.java
=====================================
@@ -56,39 +56,11 @@ public class XxHash64
checkPositionIndexes(0, offset + length, data.length());
Object base = data.getBase();
- long index = data.getAddress() + offset;
- long end = index + length;
+ final long address = data.getAddress() + offset;
long hash;
-
if (length >= 32) {
- long v1 = seed + PRIME64_1 + PRIME64_2;
- long v2 = seed + PRIME64_2;
- long v3 = seed + 0;
- long v4 = seed - PRIME64_1;
-
- long limit = end - 32;
- do {
- v1 = mix(v1, unsafe.getLong(base, index));
- index += 8;
-
- v2 = mix(v2, unsafe.getLong(base, index));
- index += 8;
-
- v3 = mix(v3, unsafe.getLong(base, index));
- index += 8;
-
- v4 = mix(v4, unsafe.getLong(base, index));
- index += 8;
- }
- while (index <= limit);
-
- hash = rotateLeft(v1, 1) + rotateLeft(v2, 7) + rotateLeft(v3, 12) + rotateLeft(v4, 18);
-
- hash = update(hash, v1);
- hash = update(hash, v2);
- hash = update(hash, v3);
- hash = update(hash, v4);
+ hash = updateBody(seed, base, address, length - 32);
}
else {
hash = seed + PRIME64_5;
@@ -96,18 +68,22 @@ public class XxHash64
hash += length;
- while (index <= end - 8) {
- hash = updateTail(hash, unsafe.getLong(base, index));
+ // round down to the closest 32 byte boundary by clearing the low five bits;
+ // this is the point up to which updateBody() processed (0 when length < 32)
+ int index = length & 0xFFFFFFE0;
+
+ while (index <= length - 8) {
+ hash = updateTail(hash, unsafe.getLong(base, address + index));
index += 8;
}
- if (index <= end - 4) {
- hash = updateTail(hash, unsafe.getInt(base, index));
+ if (index <= length - 4) {
+ hash = updateTail(hash, unsafe.getInt(base, address + index));
index += 4;
}
- while (index < end) {
- hash = updateTail(hash, unsafe.getByte(base, index));
+ while (index < length) {
+ hash = updateTail(hash, unsafe.getByte(base, address + index));
index++;
}
@@ -116,6 +92,32 @@ public class XxHash64
return hash;
}
+    /**
+     * Runs the four-lane bulk phase of the hash over 32 byte stripes starting at
+     * {@code address} and merges the lanes into a single value.
+     * One stripe is consumed for every multiple of 32 in {@code [0, length]}.
+     */
+    private static long updateBody(long seed, Object base, long address, int length)
+    {
+        long lane1 = seed + PRIME64_1 + PRIME64_2;
+        long lane2 = seed + PRIME64_2;
+        long lane3 = seed + 0;
+        long lane4 = seed - PRIME64_1;
+
+        // address of the last stripe that may still be started
+        long lastStripe = address + length;
+        long stripe = address;
+        while (stripe <= lastStripe) {
+            lane1 = mix(lane1, unsafe.getLong(base, stripe));
+            lane2 = mix(lane2, unsafe.getLong(base, stripe + 8));
+            lane3 = mix(lane3, unsafe.getLong(base, stripe + 16));
+            lane4 = mix(lane4, unsafe.getLong(base, stripe + 24));
+            stripe += 32;
+        }
+
+        long merged = rotateLeft(lane1, 1) + rotateLeft(lane2, 7) + rotateLeft(lane3, 12) + rotateLeft(lane4, 18);
+        merged = update(merged, lane1);
+        merged = update(merged, lane2);
+        merged = update(merged, lane3);
+        return update(merged, lane4);
+    }
+
private static long mix(long current, long value)
{
return rotateLeft(current + value * PRIME64_2, 31) * PRIME64_1;
=====================================
src/test/java/io/airlift/slice/SliceUtf8Benchmark.java
=====================================
@@ -0,0 +1,275 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.airlift.slice;
+
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.runner.Runner;
+import org.openjdk.jmh.runner.RunnerException;
+import org.openjdk.jmh.runner.options.Options;
+import org.openjdk.jmh.runner.options.OptionsBuilder;
+import org.openjdk.jmh.runner.options.VerboseMode;
+
+import java.nio.charset.StandardCharsets;
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.stream.IntStream;
+
+import static io.airlift.slice.SliceUtf8.countCodePoints;
+import static io.airlift.slice.SliceUtf8.leftTrim;
+import static io.airlift.slice.SliceUtf8.lengthOfCodePoint;
+import static io.airlift.slice.SliceUtf8.lengthOfCodePointFromStartByte;
+import static io.airlift.slice.SliceUtf8.offsetOfCodePoint;
+import static io.airlift.slice.SliceUtf8.reverse;
+import static io.airlift.slice.SliceUtf8.rightTrim;
+import static io.airlift.slice.SliceUtf8.substring;
+import static io.airlift.slice.SliceUtf8.toLowerCase;
+import static io.airlift.slice.SliceUtf8.toUpperCase;
+import static io.airlift.slice.SliceUtf8.trim;
+import static io.airlift.slice.Slices.utf8Slice;
+import static java.lang.Character.MAX_CODE_POINT;
+import static java.lang.Character.SURROGATE;
+import static java.lang.Character.getType;
+import static java.util.concurrent.TimeUnit.MILLISECONDS;
+import static java.util.concurrent.TimeUnit.NANOSECONDS;
+import static org.openjdk.jmh.annotations.Mode.AverageTime;
+import static org.openjdk.jmh.annotations.Scope.Thread;
+
+/**
+ * JMH micro benchmarks for the {@code SliceUtf8} helpers.
+ * <p>
+ * {@link BenchmarkData} supplies random text made of a fixed number of code
+ * points (ASCII only, or any non-surrogate code point); {@link WhitespaceData}
+ * supplies runs of whitespace with a single {@code 'X'} marker for the trim
+ * benchmarks.
+ */
+@SuppressWarnings("MethodMayBeStatic")
+@State(Thread)
+@OutputTimeUnit(NANOSECONDS)
+@BenchmarkMode(AverageTime)
+@Fork(1)
+@Warmup(iterations = 4, time = 500, timeUnit = MILLISECONDS)
+@Measurement(iterations = 5, time = 500, timeUnit = MILLISECONDS)
+public class SliceUtf8Benchmark
+{
+    @Benchmark
+    public int benchmarkLengthOfCodePointFromStartByte(BenchmarkData data)
+    {
+        Slice slice = data.getSlice();
+        int i = 0;
+        int codePoints = 0;
+        while (i < slice.length()) {
+            i += lengthOfCodePointFromStartByte(slice.getByte(i));
+            codePoints++;
+        }
+        // sanity check so a broken implementation is not silently benchmarked
+        if (codePoints != data.getLength()) {
+            throw new AssertionError();
+        }
+        return codePoints;
+    }
+
+    @Benchmark
+    public int benchmarkCountCodePoints(BenchmarkData data)
+    {
+        int codePoints = countCodePoints(data.getSlice());
+        if (codePoints != data.getLength()) {
+            throw new AssertionError();
+        }
+        return codePoints;
+    }
+
+    @Benchmark
+    public int benchmarkOffsetByCodePoints(BenchmarkData data)
+    {
+        Slice slice = data.getSlice();
+        // locate the last code point; it must end exactly at the end of the slice
+        int offset = offsetOfCodePoint(slice, data.getLength() - 1);
+        if (offset + lengthOfCodePoint(slice, offset) != slice.length()) {
+            throw new AssertionError();
+        }
+        return offset;
+    }
+
+    @Benchmark
+    public Slice benchmarkSubstring(BenchmarkData data)
+    {
+        Slice slice = data.getSlice();
+        int length = data.getLength();
+        return substring(slice, (length / 2) - 1, length / 2);
+    }
+
+    @Benchmark
+    public Slice benchmarkReverse(BenchmarkData data)
+    {
+        return reverse(data.getSlice());
+    }
+
+    @Benchmark
+    public Slice benchmarkToLowerCase(BenchmarkData data)
+    {
+        return toLowerCase(data.getSlice());
+    }
+
+    @Benchmark
+    public Slice benchmarkToUpperCase(BenchmarkData data)
+    {
+        return toUpperCase(data.getSlice());
+    }
+
+    @Benchmark
+    public Slice benchmarkLeftTrim(WhitespaceData data)
+    {
+        return leftTrim(data.getLeftWhitespace());
+    }
+
+    @Benchmark
+    public Slice benchmarkRightTrim(WhitespaceData data)
+    {
+        return rightTrim(data.getRightWhitespace());
+    }
+
+    @Benchmark
+    public Slice benchmarkTrim(WhitespaceData data)
+    {
+        return trim(data.getBothWhitespace());
+    }
+
+    /**
+     * Random UTF-8 text consisting of exactly {@code length} code points.
+     */
+    @State(Thread)
+    public static class BenchmarkData
+    {
+        private static final int[] ASCII_CODE_POINTS;
+        private static final int[] ALL_CODE_POINTS;
+
+        static {
+            ASCII_CODE_POINTS = IntStream.rangeClosed(0, 0x7F)
+                    .toArray();
+            // surrogates have no UTF-8 encoding, so they are excluded
+            ALL_CODE_POINTS = IntStream.rangeClosed(0, MAX_CODE_POINT)
+                    .filter(codePoint -> getType(codePoint) != SURROGATE)
+                    .toArray();
+        }
+
+        @Param({ "2", "5", "10", "100", "1000", "10000" })
+        private int length;
+
+        @Param({ "true", "false" })
+        private boolean ascii;
+
+        private Slice slice;
+        private int[] codePoints;
+
+        @Setup
+        public void setup()
+        {
+            int[] codePointSet = ascii ? ASCII_CODE_POINTS : ALL_CODE_POINTS;
+            ThreadLocalRandom random = ThreadLocalRandom.current();
+
+            codePoints = new int[length];
+            // a code point needs at most 4 bytes in UTF-8
+            DynamicSliceOutput sliceOutput = new DynamicSliceOutput(length * 4);
+            for (int i = 0; i < codePoints.length; i++) {
+                int codePoint = codePointSet[random.nextInt(codePointSet.length)];
+                codePoints[i] = codePoint;
+                sliceOutput.appendBytes(new String(Character.toChars(codePoint)).getBytes(StandardCharsets.UTF_8));
+            }
+            slice = sliceOutput.slice();
+        }
+
+        public Slice getSlice()
+        {
+            return slice;
+        }
+
+        public int getLength()
+        {
+            return length;
+        }
+    }
+
+    /**
+     * Whitespace runs of {@code length + 1} code points in which one code point
+     * (the last, the first, or the one at index {@code length / 2}) is replaced
+     * by the non-whitespace marker {@code 'X'}.
+     */
+    @State(Thread)
+    public static class WhitespaceData
+    {
+        private static final int[] ASCII_WHITESPACE;
+        private static final int[] ALL_WHITESPACE;
+
+        static {
+            ASCII_WHITESPACE = IntStream.rangeClosed(0, 0x7F)
+                    .filter(Character::isWhitespace)
+                    .toArray();
+            ALL_WHITESPACE = IntStream.rangeClosed(0, MAX_CODE_POINT)
+                    .filter(Character::isWhitespace)
+                    .toArray();
+        }
+
+        @Param({ "2", "5", "10", "100", "1000", "10000" })
+        private int length;
+
+        @Param({ "true", "false" })
+        private boolean ascii;
+
+        private Slice leftWhitespace;
+        private Slice rightWhitespace;
+        private Slice bothWhitespace;
+
+        @Setup
+        public void setup()
+        {
+            int[] whitespace = createRandomCodePoints(ascii ? ASCII_WHITESPACE : ALL_WHITESPACE, length + 1);
+            // The marker replaces a whole code point rather than a single byte:
+            // overwriting one byte of a multi-byte whitespace character would
+            // leave invalid UTF-8 behind, which the trim methods do not support.
+            leftWhitespace = withMarkerAt(whitespace, whitespace.length - 1);
+            rightWhitespace = withMarkerAt(whitespace, 0);
+            bothWhitespace = withMarkerAt(whitespace, length / 2);
+        }
+
+        // Picks {@code length} random code points from {@code codePointSet}.
+        private static int[] createRandomCodePoints(int[] codePointSet, int length)
+        {
+            int[] codePoints = new int[length];
+            ThreadLocalRandom random = ThreadLocalRandom.current();
+            for (int i = 0; i < codePoints.length; i++) {
+                codePoints[i] = codePointSet[random.nextInt(codePointSet.length)];
+            }
+            return codePoints;
+        }
+
+        // Encodes a copy of {@code codePoints} whose element at {@code index} is 'X'.
+        private static Slice withMarkerAt(int[] codePoints, int index)
+        {
+            int[] marked = codePoints.clone();
+            marked[index] = 'X';
+            return utf8Slice(new String(marked, 0, marked.length));
+        }
+
+        public int getLength()
+        {
+            return length;
+        }
+
+        public Slice getLeftWhitespace()
+        {
+            return leftWhitespace;
+        }
+
+        public Slice getRightWhitespace()
+        {
+            return rightWhitespace;
+        }
+
+        public Slice getBothWhitespace()
+        {
+            return bothWhitespace;
+        }
+    }
+
+    public static void main(String[] args)
+            throws RunnerException
+    {
+        Options options = new OptionsBuilder()
+                .verbosity(VerboseMode.NORMAL)
+                .include(".*" + SliceUtf8Benchmark.class.getSimpleName() + ".*")
+                .build();
+
+        new Runner(options).run();
+    }
+}
=====================================
src/test/java/io/airlift/slice/TestSlice.java
=====================================
@@ -13,7 +13,6 @@
*/
package io.airlift.slice;
-import org.testng.Assert;
import org.testng.annotations.Test;
import java.io.ByteArrayInputStream;
@@ -200,9 +199,9 @@ public class TestSlice
String s = "apple \u2603 snowman";
Slice slice = Slices.copiedBuffer(s, UTF_8);
- assertEquals(Slices.utf8Slice(s), slice);
+ assertEquals(utf8Slice(s), slice);
assertEquals(slice.toStringUtf8(), s);
- assertEquals(Slices.utf8Slice(s).toStringUtf8(), s);
+ assertEquals(utf8Slice(s).toStringUtf8(), s);
}
@SuppressWarnings("CharUsedInArithmeticContext")
@@ -676,29 +675,41 @@ public class TestSlice
assertEquals(slice.getBytes(), output.getBytes());
}
+    // A view created with slice() reports its own length but still retains
+    // the full allocation of the slice it was cut from.
+    @Test
+    public void testRetainedSize()
+            throws Exception
+    {
+        Slice whole = Slices.allocate(10);
+        assertEquals(whole.getRetainedSize(), 10);
+        assertEquals(whole.length(), 10);
+
+        Slice firstByteView = whole.slice(0, 1);
+        assertEquals(firstByteView.getRetainedSize(), 10);
+        assertEquals(firstByteView.length(), 1);
+    }
+
@Test
public void testCopyOf()
throws Exception
{
// slightly stronger guarantees for empty slice
assertSame(Slices.copyOf(EMPTY_SLICE), EMPTY_SLICE);
- assertSame(Slices.copyOf(Slices.utf8Slice("hello world"), 1, 0), EMPTY_SLICE);
+ assertSame(Slices.copyOf(utf8Slice("hello world"), 1, 0), EMPTY_SLICE);
- Slice slice = Slices.utf8Slice("hello world");
+ Slice slice = utf8Slice("hello world");
assertEquals(Slices.copyOf(slice), slice);
assertEquals(Slices.copyOf(slice, 1, 3), slice.slice(1, 3));
// verify it's an actual copy
- Slice original = Slices.utf8Slice("hello world");
+ Slice original = utf8Slice("hello world");
Slice copy = Slices.copyOf(original);
original.fill((byte) 0);
- assertEquals(copy, Slices.utf8Slice("hello world"));
+ assertEquals(copy, utf8Slice("hello world"));
// read before beginning
try {
Slices.copyOf(slice, -1, slice.length());
- Assert.fail();
+ fail();
}
catch (IndexOutOfBoundsException ignored) {
}
@@ -706,7 +717,7 @@ public class TestSlice
// read after end
try {
Slices.copyOf(slice, slice.length() + 1, 1);
- Assert.fail();
+ fail();
}
catch (IndexOutOfBoundsException ignored) {
}
@@ -714,12 +725,64 @@ public class TestSlice
// start before but extend past end
try {
Slices.copyOf(slice, 1, slice.length());
- Assert.fail();
+ fail();
}
catch (IndexOutOfBoundsException ignored) {
}
+ }
+
+ @Test
+ public void testIndexOf()
+ throws Exception
+ {
+ assertIndexOf(utf8Slice("no-match-bigger"), utf8Slice("test"));
+ assertIndexOf(utf8Slice("no"), utf8Slice("test"));
+
+ assertIndexOf(utf8Slice("test"), utf8Slice("test"));
+ assertIndexOf(utf8Slice("test-start"), utf8Slice("test"));
+ assertIndexOf(utf8Slice("end-test"), utf8Slice("test"));
+ assertIndexOf(utf8Slice("a-test-middle"), utf8Slice("test"));
+ assertIndexOf(utf8Slice("this-test-is-a-test"), utf8Slice("test"));
+
+ assertIndexOf(utf8Slice("test"), EMPTY_SLICE, 0, 0);
+ assertIndexOf(EMPTY_SLICE, utf8Slice("test"), 0, -1);
+ assertIndexOf(utf8Slice("test"), utf8Slice("no"), 4, -1);
+ assertIndexOf(utf8Slice("test"), utf8Slice("no"), 5, -1);
+ assertIndexOf(utf8Slice("test"), utf8Slice("no"), -1, -1);
+ }
+
+ public static void assertIndexOf(Slice data, Slice pattern, int offset, int expected)
+ {
+ assertEquals(data.indexOf(pattern, offset), expected);
+ assertEquals(data.indexOfBruteForce(pattern, offset), expected);
+ }
+
+ public static void assertIndexOf(Slice data, Slice pattern)
+ {
+ int index;
+
+ List<Integer> bruteForce = new ArrayList<>();
+ index = 0;
+ while (index >= 0 && index < data.length()) {
+ index = data.indexOfBruteForce(pattern, index);
+ if (index >= 0) {
+ bruteForce.add(index);
+ index++;
+ }
+ }
+
+ List<Integer> indexOf = new ArrayList<>();
+ index = 0;
+ while (index >= 0 && index < data.length()) {
+ index = data.indexOf(pattern, index);
+ if (index >= 0) {
+ indexOf.add(index);
+ index++;
+ }
+ }
+ assertEquals(bruteForce, indexOf);
}
private static List<Long> createRandomLongs(int count)
=====================================
src/test/java/io/airlift/slice/TestSliceUtf8.java
=====================================
@@ -0,0 +1,711 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.airlift.slice;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Lists;
+import com.google.common.primitives.Ints;
+import org.testng.annotations.Test;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.stream.IntStream;
+
+import static com.google.common.primitives.Bytes.concat;
+import static io.airlift.slice.SliceUtf8.codePointToUtf8;
+import static io.airlift.slice.SliceUtf8.countCodePoints;
+import static io.airlift.slice.SliceUtf8.fixInvalidUtf8;
+import static io.airlift.slice.SliceUtf8.getCodePointAt;
+import static io.airlift.slice.SliceUtf8.getCodePointBefore;
+import static io.airlift.slice.SliceUtf8.isAscii;
+import static io.airlift.slice.SliceUtf8.leftTrim;
+import static io.airlift.slice.SliceUtf8.lengthOfCodePoint;
+import static io.airlift.slice.SliceUtf8.lengthOfCodePointFromStartByte;
+import static io.airlift.slice.SliceUtf8.lengthOfCodePointSafe;
+import static io.airlift.slice.SliceUtf8.offsetOfCodePoint;
+import static io.airlift.slice.SliceUtf8.reverse;
+import static io.airlift.slice.SliceUtf8.rightTrim;
+import static io.airlift.slice.SliceUtf8.setCodePointAt;
+import static io.airlift.slice.SliceUtf8.substring;
+import static io.airlift.slice.SliceUtf8.toLowerCase;
+import static io.airlift.slice.SliceUtf8.toUpperCase;
+import static io.airlift.slice.SliceUtf8.trim;
+import static io.airlift.slice.Slices.EMPTY_SLICE;
+import static io.airlift.slice.Slices.utf8Slice;
+import static io.airlift.slice.Slices.wrappedBuffer;
+import static java.lang.Character.MAX_CODE_POINT;
+import static java.lang.Character.MIN_SURROGATE;
+import static java.lang.Character.SURROGATE;
+import static java.lang.Character.getType;
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.testng.Assert.assertEquals;
+import static org.testng.Assert.assertFalse;
+import static org.testng.Assert.assertTrue;
+
+public class TestSliceUtf8
+{
+ // Code point fixtures and their String forms; all are assigned in the static initializer.
+ private static final int[] ASCII_CODE_POINTS;
+ private static final String STRING_ASCII_CODE_POINTS;
+ private static final int[] ALL_CODE_POINTS;
+ private static final String STRING_ALL_CODE_POINTS;
+ private static final int[] ALL_CODE_POINTS_RANDOM;
+ private static final String STRING_ALL_CODE_POINTS_RANDOM;
+
+ // Sample UTF-8 bytes, each with all payload bits set:
+ // 0xxxxxxx (single byte), 10xxxxxx (continuation), and the 110xxxxx .. 1111110x
+ // lead-byte patterns for 2 to 6 byte forms. 0xFE and 0xFF fit none of these patterns.
+ private static final byte START_1_BYTE = (byte) 0b0111_1111;
+ private static final byte CONTINUATION_BYTE = (byte) 0b1011_1111;
+ private static final byte START_2_BYTE = (byte) 0b1101_1111;
+ private static final byte START_3_BYTE = (byte) 0b1110_1111;
+ private static final byte START_4_BYTE = (byte) 0b1111_0111;
+ private static final byte START_5_BYTE = (byte) 0b1111_1011;
+ private static final byte START_6_BYTE = (byte) 0b1111_1101;
+ private static final byte INVALID_FE_BYTE = (byte) 0b11111110;
+ private static final byte INVALID_FF_BYTE = (byte) 0b11111111;
+ private static final byte X_CHAR = (byte) 'X';
+
+ // Byte sequences the tests treat as malformed UTF-8; built in the static initializer.
+ private static final List<byte[]> INVALID_SEQUENCES;
+
+ // Builds the shared fixtures: code point tables with their String forms, and the
+ // list of malformed UTF-8 byte sequences.
+ static {
+ ASCII_CODE_POINTS = IntStream.rangeClosed(0, 0x7F)
+ .toArray();
+ STRING_ASCII_CODE_POINTS = new String(ASCII_CODE_POINTS, 0, ASCII_CODE_POINTS.length);
+
+ // every code point except surrogates, which cannot be encoded as UTF-8
+ ALL_CODE_POINTS = IntStream.rangeClosed(0, MAX_CODE_POINT)
+ .filter(codePoint -> getType(codePoint) != SURROGATE)
+ .toArray();
+ STRING_ALL_CODE_POINTS = new String(ALL_CODE_POINTS, 0, ALL_CODE_POINTS.length);
+
+ ALL_CODE_POINTS_RANDOM = Arrays.copyOf(ALL_CODE_POINTS, ALL_CODE_POINTS.length);
+ // Ints.asList is a write-through view of the int[], so shuffling the view permutes the array.
+ // Arrays.asList(int[]) would instead build a one-element List<int[]> and shuffle nothing.
+ Collections.shuffle(Ints.asList(ALL_CODE_POINTS_RANDOM));
+ STRING_ALL_CODE_POINTS_RANDOM = new String(ALL_CODE_POINTS_RANDOM, 0, ALL_CODE_POINTS_RANDOM.length);
+
+ ImmutableList.Builder<byte[]> invalidSequences = ImmutableList.builder();
+ // a lone continuation byte, then lead bytes followed by too few continuation bytes
+ invalidSequences.add(new byte[] {CONTINUATION_BYTE});
+ invalidSequences.add(new byte[] {START_2_BYTE});
+ invalidSequences.add(new byte[] {START_3_BYTE});
+ invalidSequences.add(new byte[] {START_3_BYTE, CONTINUATION_BYTE});
+ invalidSequences.add(new byte[] {START_4_BYTE});
+ invalidSequences.add(new byte[] {START_4_BYTE, CONTINUATION_BYTE});
+ invalidSequences.add(new byte[] {START_4_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE});
+ // 4 byte sequence is limited to 10FFFF
+ invalidSequences.add(new byte[] {START_4_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE});
+ // 5 and 6 byte forms, at every length up to the full sequence
+ invalidSequences.add(new byte[] {START_5_BYTE});
+ invalidSequences.add(new byte[] {START_5_BYTE, CONTINUATION_BYTE});
+ invalidSequences.add(new byte[] {START_5_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE});
+ invalidSequences.add(new byte[] {START_5_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE});
+ invalidSequences.add(new byte[] {START_5_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE});
+ invalidSequences.add(new byte[] {START_6_BYTE});
+ invalidSequences.add(new byte[] {START_6_BYTE, CONTINUATION_BYTE});
+ invalidSequences.add(new byte[] {START_6_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE});
+ invalidSequences.add(new byte[] {START_6_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE});
+ invalidSequences.add(new byte[] {START_6_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE});
+ invalidSequences.add(new byte[] {START_6_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE});
+ invalidSequences.add(new byte[] {INVALID_FF_BYTE});
+
+ // min and max surrogate characters
+ invalidSequences.add(new byte[] {(byte) 0b11101101, (byte) 0xA0, (byte) 0x80});
+ invalidSequences.add(new byte[] {(byte) 0b11101101, (byte) 0xBF, (byte) 0xBF});
+ INVALID_SEQUENCES = invalidSequences.build();
+ }
+
+ // Sample strings: ASCII only, Latin-1 letters, CJK, and a surrogate pair (supplementary code point).
+ private static final String STRING_EMPTY = "";
+ private static final String STRING_HELLO = "hello";
+ private static final String STRING_QUADRATICALLY = "Quadratically";
+ private static final String STRING_OESTERREICH = "\u00D6sterreich";
+ private static final String STRING_DULIOE_DULIOE = "Duli\u00F6 duli\u00F6";
+ private static final String STRING_FAITH_HOPE_LOVE = "\u4FE1\u5FF5,\u7231,\u5E0C\u671B";
+ private static final String STRING_NAIVE = "na\u00EFve";
+ private static final String STRING_OO = "\uD801\uDC2Dend";
+ // length increase when cast to lower case, and ends with invalid character
+ private static final byte[] INVALID_SEQUENCE_TO_LOWER_EXPANDS = new byte[] {(byte) 0xC8, (byte) 0xBA, (byte) 0xFF};
+
+ // -127 is byte 0x81, which has the continuation-byte bit pattern 10xxxxxx
+ private static final byte[] INVALID_UTF8_1 = new byte[] {-127};
+ private static final byte[] INVALID_UTF8_2 = new byte[] {50, -127, 52, 50};
+
+ @Test
+ public void testCodePointCount()
+ {
+ // well formed input must agree with String.codePoints().count()
+ assertCodePointCount(STRING_EMPTY);
+ assertCodePointCount(STRING_HELLO);
+ assertCodePointCount(STRING_QUADRATICALLY);
+ assertCodePointCount(STRING_OESTERREICH);
+ assertCodePointCount(STRING_DULIOE_DULIOE);
+ assertCodePointCount(STRING_FAITH_HOPE_LOVE);
+ assertCodePointCount(STRING_NAIVE);
+ assertCodePointCount(STRING_OO);
+ assertCodePointCount(STRING_ASCII_CODE_POINTS);
+ assertCodePointCount(STRING_ALL_CODE_POINTS);
+ assertCodePointCount(STRING_ALL_CODE_POINTS_RANDOM);
+
+ // single bytes: every lead or invalid byte is expected to count as one code point,
+ // while a lone continuation byte is expected to count as zero
+ assertEquals(countCodePoints(wrappedBuffer(START_1_BYTE)), 1);
+ assertEquals(countCodePoints(wrappedBuffer(START_2_BYTE)), 1);
+ assertEquals(countCodePoints(wrappedBuffer(START_3_BYTE)), 1);
+ assertEquals(countCodePoints(wrappedBuffer(START_4_BYTE)), 1);
+ assertEquals(countCodePoints(wrappedBuffer(START_5_BYTE)), 1);
+ assertEquals(countCodePoints(wrappedBuffer(START_6_BYTE)), 1);
+ assertEquals(countCodePoints(wrappedBuffer(INVALID_FE_BYTE)), 1);
+ assertEquals(countCodePoints(wrappedBuffer(INVALID_FF_BYTE)), 1);
+ assertEquals(countCodePoints(wrappedBuffer(CONTINUATION_BYTE)), 0);
+ }
+
+ /**
+ * Asserts that {@code countCodePoints} on the UTF-8 encoding of {@code string}
+ * matches the code point count reported by the JDK.
+ */
+ private static void assertCodePointCount(String string)
+ {
+ assertEquals(countCodePoints(utf8Slice(string)), string.codePoints().count());
+ }
+
+ @Test
+ public void testOffsetByCodePoints()
+ {
+ // an empty slice has no code point 0, so -1 is expected
+ assertEquals(offsetOfCodePoint(EMPTY_SLICE, 0), -1);
+ assertOffsetByCodePoints(STRING_HELLO);
+ assertOffsetByCodePoints(STRING_QUADRATICALLY);
+ assertOffsetByCodePoints(STRING_OESTERREICH);
+ assertOffsetByCodePoints(STRING_DULIOE_DULIOE);
+ assertOffsetByCodePoints(STRING_FAITH_HOPE_LOVE);
+ assertOffsetByCodePoints(STRING_NAIVE);
+ assertOffsetByCodePoints(STRING_OO);
+ assertOffsetByCodePoints(STRING_ASCII_CODE_POINTS);
+ assertOffsetByCodePoints(STRING_ALL_CODE_POINTS);
+ assertOffsetByCodePoints(STRING_ALL_CODE_POINTS_RANDOM);
+ }
+
+ /**
+ * Walks {@code string} one code point at a time, computing the expected UTF-8 byte offset
+ * of each code point incrementally, and checks {@code offsetOfCodePoint} against it, both
+ * from the start of the slice and relative to the previous offset.
+ */
+ private static void assertOffsetByCodePoints(String string)
+ {
+ Slice utf8 = utf8Slice(string);
+
+ int codePoints = (int) string.codePoints().count();
+ // lastIndex: byte offset of the previous code point in utf8
+ // characterIndex: char index in string of the previous code point
+ int lastIndex = 0;
+ int characterIndex = 0;
+ for (int codePointIndex = 0; codePointIndex < codePoints; codePointIndex++) {
+ int expectedIndex = 0;
+
+ // calculate the expected index by searching forward from the last index
+ if (codePointIndex > 0) {
+ expectedIndex = lastIndex + lengthOfCodePoint(string.codePointAt(characterIndex));
+ characterIndex = string.offsetByCodePoints(characterIndex, 1);
+ }
+ // avoid n^2 performance for large test string
+ if (codePointIndex < 10000) {
+ assertEquals(offsetOfCodePoint(utf8, codePointIndex), expectedIndex);
+ }
+
+ // stepping one code point forward from the previous offset must land on the same place
+ if (codePointIndex > 0) {
+ assertEquals(offsetOfCodePoint(utf8, lastIndex, 1), expectedIndex);
+ }
+ lastIndex = expectedIndex;
+ }
+ // asking for the code point one past the last is expected to yield -1
+ assertEquals(offsetOfCodePoint(utf8Slice(string), codePoints), -1);
+ }
+
+ @Test
+ public void testSubstring()
+ {
+ // each sample is checked against substrings built from the JDK code point array
+ assertSubstring(STRING_HELLO);
+ assertSubstring(STRING_QUADRATICALLY);
+ assertSubstring(STRING_OESTERREICH);
+ assertSubstring(STRING_DULIOE_DULIOE);
+ assertSubstring(STRING_FAITH_HOPE_LOVE);
+ assertSubstring(STRING_NAIVE);
+ assertSubstring(STRING_OO);
+ assertSubstring(STRING_ASCII_CODE_POINTS);
+ // substring test over all code points takes too long, so only run it on the tail
+ // that has the largest code points
+ assertSubstring(new String(ALL_CODE_POINTS, ALL_CODE_POINTS.length - 500, 500));
+ }
+
+ /**
+ * For every start position in the first half of {@code string}, takes a substring of up to
+ * 20 code points and compares it with the same range built from the JDK code point array.
+ * Also checks the full-length and zero-length substrings taken from position 0.
+ */
+ private static void assertSubstring(String string)
+ {
+ Slice utf8 = utf8Slice(string);
+
+ int[] codePoints = string.codePoints().toArray();
+ for (int start = 0; start < codePoints.length / 2; start++) {
+ // capped at 20; the count shrinks by two for each step of start
+ int count = Math.min(20, codePoints.length - start - start - 1);
+ Slice actual = substring(utf8, start, count);
+ Slice expected = wrappedBuffer(new String(codePoints, start, count).getBytes(UTF_8));
+ assertEquals(actual, expected);
+ }
+ assertEquals(substring(utf8, 0, codePoints.length), utf8);
+ assertEquals(substring(utf8, 0, 0), EMPTY_SLICE);
+ }
+
+ // start position 10 lies beyond the five code points of "hello"
+ @Test(expectedExceptions = IllegalArgumentException.class, expectedExceptionsMessageRegExp = "UTF-8 does not contain 10 code points")
+ public void testSubstringInvalidStart()
+ {
+ substring(utf8Slice(STRING_HELLO), 10, 2);
+ }
+
+ // requested length 7 exceeds the five code points of "hello"
+ @Test(expectedExceptions = IllegalArgumentException.class, expectedExceptionsMessageRegExp = "UTF-8 does not contain 7 code points")
+ public void testSubstringInvalidLength()
+ {
+ substring(utf8Slice(STRING_HELLO), 0, 7);
+ }
+
+ // the fourth code point is a 3 byte lead followed by only one continuation byte
+ @Test(expectedExceptions = InvalidUtf8Exception.class, expectedExceptionsMessageRegExp = "UTF-8 is not well formed")
+ public void testSubstringInvalidUtf8()
+ {
+ substring(wrappedBuffer((byte) 'f', (byte) 'o', (byte) 'o', START_3_BYTE, CONTINUATION_BYTE), 0, 4);
+ }
+
+ @Test
+ public void testReverse()
+ {
+ // well formed input: reversed code point by code point
+ assertReverse(STRING_HELLO);
+ assertReverse(STRING_QUADRATICALLY);
+ assertReverse(STRING_OESTERREICH);
+ assertReverse(STRING_DULIOE_DULIOE);
+ assertReverse(STRING_FAITH_HOPE_LOVE);
+ assertReverse(STRING_NAIVE);
+ assertReverse(STRING_OO);
+ assertReverse(STRING_ASCII_CODE_POINTS);
+ assertReverse(STRING_ALL_CODE_POINTS);
+
+ // malformed sequences are expected to be carried over with their bytes in the original order
+ INVALID_SEQUENCES.forEach(TestSliceUtf8::assertReverseWithInvalidSequence);
+ }
+
+ /**
+ * Asserts that {@code reverse} on the UTF-8 encoding of {@code string} equals the encoding
+ * of the string's code points in reverse order.
+ */
+ private static void assertReverse(String string)
+ {
+ Slice actualReverse = reverse(utf8Slice(string));
+
+ // build the expected value by reversing the code point array, not the chars
+ int[] codePoints = string.codePoints().toArray();
+ codePoints = Ints.toArray(Lists.reverse(Ints.asList(codePoints)));
+ Slice expectedReverse = wrappedBuffer(new String(codePoints, 0, codePoints.length).getBytes(UTF_8));
+
+ assertEquals(actualReverse, expectedReverse);
+ }
+
+ /**
+ * Asserts that {@code reverse} moves {@code invalidSequence} as one unit, with its bytes in
+ * the original order, when it stands alone, trails ASCII text, leads ASCII text, or sits
+ * between two ASCII runs.
+ */
+ private static void assertReverseWithInvalidSequence(byte[] invalidSequence)
+ {
+ byte[] abc = {'a', 'b', 'c'};
+ byte[] cba = {'c', 'b', 'a'};
+ byte[] xyz = {'x', 'y', 'z'};
+ byte[] zyx = {'z', 'y', 'x'};
+
+ // alone
+ Slice reversedAlone = reverse(wrappedBuffer(invalidSequence));
+ assertEquals(reversedAlone, wrappedBuffer(invalidSequence));
+
+ // after ASCII text
+ Slice reversedSuffix = reverse(wrappedBuffer(concat(abc, invalidSequence)));
+ assertEquals(reversedSuffix, wrappedBuffer(concat(invalidSequence, cba)));
+
+ // before ASCII text
+ Slice reversedPrefix = reverse(wrappedBuffer(concat(invalidSequence, xyz)));
+ assertEquals(reversedPrefix, wrappedBuffer(concat(zyx, invalidSequence)));
+
+ // between two ASCII runs
+ Slice reversedMiddle = reverse(wrappedBuffer(concat(abc, invalidSequence, xyz)));
+ assertEquals(reversedMiddle, wrappedBuffer(concat(zyx, invalidSequence, cba)));
+ }
+
+ @Test
+ public void testIsAscii()
+ {
+ // true only for the samples made up entirely of code points 0x00-0x7F
+ assertTrue(isAscii(utf8Slice(STRING_HELLO)));
+ assertTrue(isAscii(utf8Slice(STRING_QUADRATICALLY)));
+ assertFalse(isAscii(utf8Slice(STRING_OESTERREICH)));
+ assertFalse(isAscii(utf8Slice(STRING_DULIOE_DULIOE)));
+ assertFalse(isAscii(utf8Slice(STRING_FAITH_HOPE_LOVE)));
+ assertFalse(isAscii(utf8Slice(STRING_NAIVE)));
+ assertFalse(isAscii(utf8Slice(STRING_OO)));
+ assertTrue(isAscii(utf8Slice(STRING_ASCII_CODE_POINTS)));
+ assertFalse(isAscii(utf8Slice(STRING_ALL_CODE_POINTS)));
+ }
+
+ @Test
+ public void testFixInvalidUtf8()
+ {
+ // well formed input must come back unchanged
+ assertFixInvalidUtf8(utf8Slice(STRING_OESTERREICH), utf8Slice(STRING_OESTERREICH));
+ assertFixInvalidUtf8(utf8Slice(STRING_HELLO), utf8Slice(STRING_HELLO));
+ assertFixInvalidUtf8(utf8Slice(STRING_QUADRATICALLY), utf8Slice(STRING_QUADRATICALLY));
+ assertFixInvalidUtf8(utf8Slice(STRING_OESTERREICH), utf8Slice(STRING_OESTERREICH));
+ assertFixInvalidUtf8(utf8Slice(STRING_DULIOE_DULIOE), utf8Slice(STRING_DULIOE_DULIOE));
+ assertFixInvalidUtf8(utf8Slice(STRING_FAITH_HOPE_LOVE), utf8Slice(STRING_FAITH_HOPE_LOVE));
+ assertFixInvalidUtf8(utf8Slice(STRING_NAIVE), utf8Slice(STRING_NAIVE));
+ assertFixInvalidUtf8(utf8Slice(STRING_OO), utf8Slice(STRING_OO));
+ assertFixInvalidUtf8(utf8Slice(STRING_ASCII_CODE_POINTS), utf8Slice(STRING_ASCII_CODE_POINTS));
+ assertFixInvalidUtf8(utf8Slice(STRING_ALL_CODE_POINTS), utf8Slice(STRING_ALL_CODE_POINTS));
+
+ // max valid value for 2, 3, and 4 byte sequences
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_2_BYTE, CONTINUATION_BYTE), utf8Slice("X\u07FF"));
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_3_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE), utf8Slice("X\uFFFF"));
+ // 4 byte sequence is limited to U+10FFFF by RFC 3629
+ assertFixInvalidUtf8(
+ wrappedBuffer(X_CHAR, (byte) 0xF4, (byte) 0x8F, CONTINUATION_BYTE, CONTINUATION_BYTE),
+ wrappedBuffer(X_CHAR, (byte) 0xF4, (byte) 0x8F, CONTINUATION_BYTE, CONTINUATION_BYTE));
+
+ // from here on, each malformed sequence is expected to become a single U+FFFD replacement character
+
+ // 4 byte sequence is limited to 10FFFF
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_4_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE), utf8Slice("X\uFFFD"));
+
+ // 5 and 6 byte sequences are always invalid
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_5_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE), utf8Slice("X\uFFFD"));
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_6_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE), utf8Slice("X\uFFFD"));
+
+ // continuation byte alone is invalid
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, CONTINUATION_BYTE), utf8Slice("X\uFFFD"));
+
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, INVALID_FE_BYTE), utf8Slice("X\uFFFD"));
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, INVALID_FF_BYTE), utf8Slice("X\uFFFD"));
+
+ // sequences with not enough continuation bytes, but enough bytes
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_2_BYTE, X_CHAR), utf8Slice("X\uFFFDX"));
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_3_BYTE, X_CHAR, X_CHAR), utf8Slice("X\uFFFDXX"));
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_3_BYTE, CONTINUATION_BYTE, X_CHAR), utf8Slice("X\uFFFDX"));
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_4_BYTE, X_CHAR, X_CHAR, X_CHAR), utf8Slice("X\uFFFDXXX"));
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_4_BYTE, CONTINUATION_BYTE, X_CHAR, X_CHAR), utf8Slice("X\uFFFDXX"));
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_4_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, X_CHAR), utf8Slice("X\uFFFDX"));
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_5_BYTE, X_CHAR, X_CHAR, X_CHAR, X_CHAR), utf8Slice("X\uFFFDXXXX"));
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_5_BYTE, CONTINUATION_BYTE, X_CHAR, X_CHAR, X_CHAR), utf8Slice("X\uFFFDXXX"));
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_5_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, X_CHAR, X_CHAR), utf8Slice("X\uFFFDXX"));
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_5_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, X_CHAR), utf8Slice("X\uFFFDX"));
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_6_BYTE, X_CHAR, X_CHAR, X_CHAR, X_CHAR, X_CHAR), utf8Slice("X\uFFFDXXXXX"));
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_6_BYTE, CONTINUATION_BYTE, X_CHAR, X_CHAR, X_CHAR, X_CHAR), utf8Slice("X\uFFFDXXXX"));
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_6_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, X_CHAR, X_CHAR, X_CHAR), utf8Slice("X\uFFFDXXX"));
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_6_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, X_CHAR, X_CHAR), utf8Slice("X\uFFFDXX"));
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_6_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, X_CHAR), utf8Slice("X\uFFFDX"));
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, INVALID_FE_BYTE), utf8Slice("X\uFFFD"));
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, INVALID_FF_BYTE), utf8Slice("X\uFFFD"));
+
+ // truncated sequences
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, CONTINUATION_BYTE), utf8Slice("X\uFFFD"));
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_2_BYTE), utf8Slice("X\uFFFD"));
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_3_BYTE), utf8Slice("X\uFFFD"));
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_3_BYTE, CONTINUATION_BYTE), utf8Slice("X\uFFFD"));
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_4_BYTE), utf8Slice("X\uFFFD"));
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_4_BYTE, CONTINUATION_BYTE), utf8Slice("X\uFFFD"));
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_4_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE), utf8Slice("X\uFFFD"));
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_5_BYTE), utf8Slice("X\uFFFD"));
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_5_BYTE, CONTINUATION_BYTE), utf8Slice("X\uFFFD"));
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_5_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE), utf8Slice("X\uFFFD"));
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_5_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE), utf8Slice("X\uFFFD"));
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_6_BYTE), utf8Slice("X\uFFFD"));
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_6_BYTE, CONTINUATION_BYTE), utf8Slice("X\uFFFD"));
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_6_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE), utf8Slice("X\uFFFD"));
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_6_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE), utf8Slice("X\uFFFD"));
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_6_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE), utf8Slice("X\uFFFD"));
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_6_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE), utf8Slice("X\uFFFD"));
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, INVALID_FE_BYTE), utf8Slice("X\uFFFD"));
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, INVALID_FF_BYTE), utf8Slice("X\uFFFD"));
+ // min and max surrogate characters
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, (byte) 0b11101101, (byte) 0xA0, (byte) 0x80), utf8Slice("X\uFFFD"));
+ assertFixInvalidUtf8(wrappedBuffer(X_CHAR, (byte) 0b11101101, (byte) 0xBF, (byte) 0xBF), utf8Slice("X\uFFFD"));
+ }
+
+ /**
+ * Asserts that {@code fixInvalidUtf8(testSlice)} equals {@code expectedSlice}.
+ */
+ private static void assertFixInvalidUtf8(Slice testSlice, Slice expectedSlice)
+ {
+ assertEquals(fixInvalidUtf8(testSlice), expectedSlice);
+ }
+
+ @Test
+ public void testCaseChange()
+ {
+ // each sample is checked once; a repeated pass over the full code point range
+ // only adds runtime without covering anything new
+ assertCaseChange(STRING_HELLO);
+ assertCaseChange(STRING_QUADRATICALLY);
+ assertCaseChange(STRING_OESTERREICH);
+ assertCaseChange(STRING_DULIOE_DULIOE);
+ assertCaseChange(STRING_FAITH_HOPE_LOVE);
+ assertCaseChange(STRING_NAIVE);
+ assertCaseChange(STRING_OO);
+ assertCaseChange(STRING_ASCII_CODE_POINTS);
+ assertCaseChange(STRING_ALL_CODE_POINTS);
+ assertCaseChange(STRING_ALL_CODE_POINTS_RANDOM);
+
+ // result is deliberately not asserted: this only checks that lower casing input which
+ // grows and then ends in an invalid byte completes without throwing
+ toLowerCase(Slices.wrappedBuffer(INVALID_SEQUENCE_TO_LOWER_EXPANDS));
+
+ // malformed sequences are expected to pass through case conversion untouched
+ INVALID_SEQUENCES.forEach(TestSliceUtf8::assertCaseChangeWithInvalidSequence);
+ }
+
+ /**
+ * Asserts that {@code toLowerCase} and {@code toUpperCase} leave {@code invalidSequence}
+ * untouched while still converting ASCII text placed before it, after it, or on both sides.
+ */
+ private static void assertCaseChangeWithInvalidSequence(byte[] invalidSequence)
+ {
+ byte[] upperFoo = {'F', 'O', 'O'};
+ byte[] lowerFoo = {'f', 'o', 'o'};
+ byte[] upperBar = {'B', 'A', 'R'};
+ byte[] lowerBar = {'b', 'a', 'r'};
+
+ // alone
+ Slice loweredAlone = toLowerCase(wrappedBuffer(invalidSequence));
+ assertEquals(loweredAlone, wrappedBuffer(invalidSequence));
+ Slice upperedAlone = toUpperCase(wrappedBuffer(invalidSequence));
+ assertEquals(upperedAlone, wrappedBuffer(invalidSequence));
+
+ // after ASCII text
+ Slice loweredSuffix = toLowerCase(wrappedBuffer(concat(upperFoo, invalidSequence)));
+ assertEquals(loweredSuffix, wrappedBuffer(concat(lowerFoo, invalidSequence)));
+ Slice upperedSuffix = toUpperCase(wrappedBuffer(concat(lowerFoo, invalidSequence)));
+ assertEquals(upperedSuffix, wrappedBuffer(concat(upperFoo, invalidSequence)));
+
+ // before ASCII text
+ Slice loweredPrefix = toLowerCase(wrappedBuffer(concat(invalidSequence, upperFoo)));
+ assertEquals(loweredPrefix, wrappedBuffer(concat(invalidSequence, lowerFoo)));
+ Slice upperedPrefix = toUpperCase(wrappedBuffer(concat(invalidSequence, lowerFoo)));
+ assertEquals(upperedPrefix, wrappedBuffer(concat(invalidSequence, upperFoo)));
+
+ // between two ASCII runs
+ Slice loweredMiddle = toLowerCase(wrappedBuffer(concat(upperFoo, invalidSequence, upperBar)));
+ assertEquals(loweredMiddle, wrappedBuffer(concat(lowerFoo, invalidSequence, lowerBar)));
+ Slice upperedMiddle = toUpperCase(wrappedBuffer(concat(lowerFoo, invalidSequence, lowerBar)));
+ assertEquals(upperedMiddle, wrappedBuffer(concat(upperFoo, invalidSequence, upperBar)));
+ }
+
+ /**
+ * Asserts that {@code toLowerCase} and {@code toUpperCase} on the UTF-8 encoding of
+ * {@code string} agree with per-code-point JDK case mapping, and that applying the opposite
+ * conversion to each result agrees with the JDK as well.
+ */
+ private static void assertCaseChange(String string)
+ {
+ String expectedLower = lowerByCodePoint(string);
+ Slice actualLower = toLowerCase(utf8Slice(string));
+ assertEquals(actualLower, wrappedBuffer(expectedLower.getBytes(UTF_8)));
+
+ String expectedUpper = upperByCodePoint(string);
+ Slice actualUpper = toUpperCase(utf8Slice(string));
+ assertEquals(actualUpper, wrappedBuffer(expectedUpper.getBytes(UTF_8)));
+
+ // lower the upper and upper the lower
+ // NOTE: not all code points roundtrip, so calculate the expected
+ assertEquals(toLowerCase(actualUpper), wrappedBuffer(lowerByCodePoint(expectedUpper).getBytes(UTF_8)));
+ assertEquals(toUpperCase(actualLower), wrappedBuffer(upperByCodePoint(expectedLower).getBytes(UTF_8)));
+ }
+
+ /**
+ * Lower cases {@code string} one code point at a time with {@link Character#toLowerCase(int)}.
+ */
+ private static String lowerByCodePoint(String string)
+ {
+ int[] lowerCodePoints = string.codePoints()
+ .map(Character::toLowerCase)
+ .toArray();
+ return new String(lowerCodePoints, 0, lowerCodePoints.length);
+ }
+
+ /**
+ * Upper cases {@code string} one code point at a time with {@link Character#toUpperCase(int)}.
+ */
+ private static String upperByCodePoint(String string)
+ {
+ int[] mapped = string.codePoints()
+ .map(Character::toUpperCase)
+ .toArray();
+ return new String(mapped, 0, mapped.length);
+ }
+
+ @Test
+ public void testLeftTrim()
+ {
+ // samples have no leading whitespace; the last one ends in a space that must be kept
+ assertLeftTrim("");
+ assertLeftTrim("hello");
+ assertLeftTrim("hello world");
+ assertLeftTrim("hello world ");
+
+ INVALID_SEQUENCES.forEach(TestSliceUtf8::assertLeftTrim);
+ }
+
+ /**
+ * Runs the byte based left trim checks on the UTF-8 encoding of {@code string}.
+ */
+ private static void assertLeftTrim(String string)
+ {
+ byte[] utf8 = string.getBytes(UTF_8);
+ assertLeftTrim(utf8);
+ }
+
+ /**
+ * Asserts that {@code leftTrim} returns {@code sequence} unchanged, and that it strips
+ * every whitespace code point (alone or mixed with CR, LF, tab and space) placed before it.
+ */
+ private static void assertLeftTrim(byte[] sequence)
+ {
+ Slice expected = wrappedBuffer(sequence);
+ assertEquals(leftTrim(wrappedBuffer(sequence)), expected);
+
+ byte[] asciiWhitespace = {'\r', '\n', '\t', ' '};
+ for (int candidate : ALL_CODE_POINTS) {
+ if (!Character.isWhitespace(candidate)) {
+ continue;
+ }
+ byte[] padding = new String(new int[] {candidate}, 0, 1).getBytes(UTF_8);
+ assertEquals(leftTrim(wrappedBuffer(concat(padding, sequence))), expected);
+ assertEquals(leftTrim(wrappedBuffer(concat(padding, asciiWhitespace, padding, sequence))), expected);
+ }
+ }
+
+ @Test
+ public void testRightTrim()
+ {
+ // samples have no trailing whitespace; the last one starts with a space that must be kept
+ assertRightTrim("");
+ assertRightTrim("hello");
+ assertRightTrim("hello world");
+ assertRightTrim(" hello world");
+
+ INVALID_SEQUENCES.forEach(TestSliceUtf8::assertRightTrim);
+ }
+
+ /**
+ * Runs the byte based right trim checks on the UTF-8 encoding of {@code string}.
+ */
+ private static void assertRightTrim(String string)
+ {
+ byte[] utf8 = string.getBytes(UTF_8);
+ assertRightTrim(utf8);
+ }
+
+ /**
+ * Asserts that {@code rightTrim} returns {@code sequence} unchanged, and that it strips
+ * every whitespace code point (alone or mixed with CR, LF, tab and space) placed after it.
+ */
+ private static void assertRightTrim(byte[] sequence)
+ {
+ Slice expected = wrappedBuffer(sequence);
+ assertEquals(rightTrim(wrappedBuffer(sequence)), expected);
+
+ byte[] asciiWhitespace = {'\r', '\n', '\t', ' '};
+ for (int candidate : ALL_CODE_POINTS) {
+ if (!Character.isWhitespace(candidate)) {
+ continue;
+ }
+ byte[] padding = new String(new int[] {candidate}, 0, 1).getBytes(UTF_8);
+ assertEquals(rightTrim(wrappedBuffer(concat(sequence, padding))), expected);
+ assertEquals(rightTrim(wrappedBuffer(concat(sequence, padding, asciiWhitespace, padding))), expected);
+ }
+ }
+
+ @Test
+ public void testTrim()
+ {
+ // samples have no leading or trailing whitespace
+ assertTrim("");
+ assertTrim("hello");
+ assertTrim("hello world");
+
+ INVALID_SEQUENCES.forEach(TestSliceUtf8::assertTrim);
+ }
+
+ /**
+ * Runs the byte based trim checks on the UTF-8 encoding of {@code string}.
+ */
+ private static void assertTrim(String string)
+ {
+ byte[] utf8 = string.getBytes(UTF_8);
+ assertTrim(utf8);
+ }
+
+ /**
+ * Asserts that {@code trim} returns {@code sequence} unchanged, and that it strips every
+ * whitespace code point (alone or mixed with CR, LF, tab and space) placed on both sides.
+ */
+ private static void assertTrim(byte[] sequence)
+ {
+ Slice expected = wrappedBuffer(sequence);
+ assertEquals(trim(wrappedBuffer(sequence)), expected);
+
+ for (int candidate : ALL_CODE_POINTS) {
+ if (!Character.isWhitespace(candidate)) {
+ continue;
+ }
+ byte[] padding = new String(new int[] {candidate}, 0, 1).getBytes(UTF_8);
+ assertEquals(trim(wrappedBuffer(concat(padding, sequence, padding))), expected);
+
+ byte[] padded = concat(padding, new byte[] {'\r', '\n', '\t', ' '}, padding, sequence, padding, new byte[] {'\r', '\n', '\t', ' '}, padding);
+ assertEquals(trim(wrappedBuffer(padded)), expected);
+ }
+ }
+
+ /**
+ * Test invalid UTF-8 encodings. We do not expect a 'correct' result, only a non-harmful one.
+ */
+ @Test
+ public void testInvalidUtf8()
+ {
+ // the continuation-pattern byte (-127) is not counted as a code point
+ assertEquals(countCodePoints(wrappedBuffer(INVALID_UTF8_1)), 0);
+ assertEquals(countCodePoints(wrappedBuffer(INVALID_UTF8_2)), 3);
+
+ assertEquals(offsetOfCodePoint(wrappedBuffer(INVALID_UTF8_1), 0), 0);
+ assertEquals(offsetOfCodePoint(wrappedBuffer(INVALID_UTF8_1), 1), -1);
+
+ // the stray byte at offset 1 is skipped, so code point 1 is expected at offset 2
+ assertEquals(offsetOfCodePoint(wrappedBuffer(INVALID_UTF8_2), 0), 0);
+ assertEquals(offsetOfCodePoint(wrappedBuffer(INVALID_UTF8_2), 1), 2);
+ assertEquals(offsetOfCodePoint(wrappedBuffer(INVALID_UTF8_2), 2), 3);
+ assertEquals(offsetOfCodePoint(wrappedBuffer(INVALID_UTF8_2), 3), -1);
+ }
+
+ @Test
+ public void testLengthOfCodePoint()
+ {
+ // lead bytes of the 1 to 4 byte forms
+ assertEquals(lengthOfCodePointFromStartByte(START_1_BYTE), 1);
+ assertEquals(lengthOfCodePointFromStartByte(START_2_BYTE), 2);
+ assertEquals(lengthOfCodePointFromStartByte(START_3_BYTE), 3);
+ assertEquals(lengthOfCodePointFromStartByte(START_4_BYTE), 4);
+
+ // every non-surrogate code point: all length helpers must agree with the JDK's UTF-8
+ // encoding, and decoding and encoding must round trip
+ for (int codePoint : ALL_CODE_POINTS) {
+ String string = new String(new int[] {codePoint}, 0, 1);
+ assertEquals(string.codePoints().count(), 1);
+
+ Slice utf8 = wrappedBuffer(string.getBytes(UTF_8));
+ assertEquals(lengthOfCodePoint(codePoint), utf8.length());
+ assertEquals(lengthOfCodePoint(utf8, 0), utf8.length());
+ assertEquals(lengthOfCodePointSafe(utf8, 0), utf8.length());
+ assertEquals(lengthOfCodePointFromStartByte(utf8.getByte(0)), utf8.length());
+
+ assertEquals(getCodePointAt(utf8, 0), codePoint);
+ assertEquals(getCodePointBefore(utf8, utf8.length()), codePoint);
+
+ assertEquals(codePointToUtf8(codePoint), utf8);
+ }
+
+ // for malformed input the safe variant is expected to report the length of the whole
+ // invalid sequence, whether it stands alone, follows 'x', or is followed by 'x'
+ for (byte[] sequence : INVALID_SEQUENCES) {
+ assertEquals(lengthOfCodePointSafe(wrappedBuffer(sequence), 0), sequence.length);
+ assertEquals(lengthOfCodePointSafe(wrappedBuffer(concat(new byte[] {'x'}, sequence)), 1), sequence.length);
+ assertEquals(lengthOfCodePointSafe(wrappedBuffer(concat(sequence, new byte[] {'x'})), 0), sequence.length);
+ }
+ }
+
+ // -1 appears in the expected message as its 32 bit hex form, 0xFFFFFFFF
+ @Test(expectedExceptions = InvalidCodePointException.class, expectedExceptionsMessageRegExp = "Invalid code point 0xFFFFFFFF")
+ public void testLengthOfNegativeCodePoint()
+ {
+ lengthOfCodePoint(-1);
+ }
+
+ // MAX_CODE_POINT + 1 is the first value past the valid code point range
+ @Test(expectedExceptions = InvalidCodePointException.class, expectedExceptionsMessageRegExp = "Invalid code point 0x110000")
+ public void testLengthOfOutOfRangeCodePoint()
+ {
+ lengthOfCodePoint(MAX_CODE_POINT + 1);
+ }
+
+ // a continuation byte cannot start a code point
+ @Test(expectedExceptions = InvalidUtf8Exception.class, expectedExceptionsMessageRegExp = "Illegal start 0xBF of code point")
+ public void testLengthOfCodePointContinuationByte()
+ {
+ lengthOfCodePointFromStartByte(CONTINUATION_BYTE);
+ }
+
+ // the lead byte of a 5 byte form is rejected as a start byte
+ @Test(expectedExceptions = InvalidUtf8Exception.class, expectedExceptionsMessageRegExp = "Illegal start 0xFB of code point")
+ public void testLengthOfCodePoint5ByteSequence()
+ {
+ lengthOfCodePointFromStartByte(START_5_BYTE);
+ }
+
+ // the lead byte of a 6 byte form is rejected as a start byte
+ @Test(expectedExceptions = InvalidUtf8Exception.class, expectedExceptionsMessageRegExp = "Illegal start 0xFD of code point")
+ public void testLengthOfCodePoint6ByteByte()
+ {
+ lengthOfCodePointFromStartByte(START_6_BYTE);
+ }
+
+ // 0xFE is rejected as a start byte
+ @Test(expectedExceptions = InvalidUtf8Exception.class, expectedExceptionsMessageRegExp = "Illegal start 0xFE of code point")
+ public void testLengthOfCodePointFEByte()
+ {
+ lengthOfCodePointFromStartByte(INVALID_FE_BYTE);
+ }
+
+ // 0xFF is rejected as a start byte
+ @Test(expectedExceptions = InvalidUtf8Exception.class, expectedExceptionsMessageRegExp = "Illegal start 0xFF of code point")
+ public void testLengthOfCodePointFFByte()
+ {
+ lengthOfCodePointFromStartByte(INVALID_FF_BYTE);
+ }
+
+ // 2 byte lead at the end of the slice with no continuation byte
+ @Test(expectedExceptions = InvalidUtf8Exception.class, expectedExceptionsMessageRegExp = "UTF-8 sequence truncated")
+ public void testCodePointAtTruncated2()
+ {
+ getCodePointAt(wrappedBuffer((byte) 'x', START_2_BYTE), 1);
+ }
+
+ // 3 byte lead followed by only one of its two continuation bytes
+ @Test(expectedExceptions = InvalidUtf8Exception.class, expectedExceptionsMessageRegExp = "UTF-8 sequence truncated")
+ public void testCodePointAtTruncated3()
+ {
+ getCodePointAt(wrappedBuffer((byte) 'x', START_3_BYTE, CONTINUATION_BYTE), 1);
+ }
+
+ // 4 byte lead followed by only two of its three continuation bytes
+ @Test(expectedExceptions = InvalidUtf8Exception.class, expectedExceptionsMessageRegExp = "UTF-8 sequence truncated")
+ public void testCodePointAtTruncated4()
+ {
+ getCodePointAt(wrappedBuffer((byte) 'x', START_4_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE), 1);
+ }
+
+ // a complete 5 byte form is still rejected, at its lead byte
+ @Test(expectedExceptions = InvalidUtf8Exception.class, expectedExceptionsMessageRegExp = "Illegal start 0xFB of code point")
+ public void testCodePointAt5ByteSequence()
+ {
+ getCodePointAt(wrappedBuffer((byte) 'x', START_5_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE), 1);
+ }
+
+ // position 6 is the end of the six byte buffer, so the backward read starts in the 5 byte form
+ @Test(expectedExceptions = InvalidUtf8Exception.class, expectedExceptionsMessageRegExp = "UTF-8 is not well formed")
+ public void testCodePointBefore5ByteSequence()
+ {
+ getCodePointBefore(wrappedBuffer((byte) 'x', START_5_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE), 6);
+ }
+
+ // -1 appears in the expected message as its 32 bit hex form, 0xFFFFFFFF
+ @Test(expectedExceptions = InvalidCodePointException.class, expectedExceptionsMessageRegExp = "Invalid code point 0xFFFFFFFF")
+ public void testSetNegativeCodePoint()
+ {
+ setCodePointAt(-1, Slices.allocate(8), 0);
+ }
+ // MIN_SURROGATE (0xD800) is rejected when writing a code point
+ @Test(expectedExceptions = InvalidCodePointException.class, expectedExceptionsMessageRegExp = "Invalid code point 0xD800")
+ public void testSetSurrogateCodePoint()
+ {
+ setCodePointAt(MIN_SURROGATE, Slices.allocate(8), 0);
+ }
+
+ // MAX_CODE_POINT + 1 is the first value past the valid code point range
+ @Test(expectedExceptions = InvalidCodePointException.class, expectedExceptionsMessageRegExp = "Invalid code point 0x110000")
+ public void testSetOutOfRangeCodePoint()
+ {
+ setCodePointAt(MAX_CODE_POINT + 1, Slices.allocate(8), 0);
+ }
+
+ // CONTINUATION_BYTE is the byte 0xBF; widened to int it is negative, hence 0xFFFFFFBF in the message
+ @Test(expectedExceptions = InvalidCodePointException.class, expectedExceptionsMessageRegExp = "Invalid code point 0xFFFFFFBF")
+ public void testSetCodePointContinuationByte()
+ {
+ setCodePointAt(CONTINUATION_BYTE, Slices.allocate(8), 0);
+ }
+
+}
View it on GitLab: https://salsa.debian.org/java-team/airlift-slice/-/commit/0b1214952e2c6fe46bd5df5aeba321f4af3cc7aa
--
View it on GitLab: https://salsa.debian.org/java-team/airlift-slice/-/commit/0b1214952e2c6fe46bd5df5aeba321f4af3cc7aa
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/pkg-java-commits/attachments/20250116/07ca50af/attachment.htm>
More information about the pkg-java-commits
mailing list