[Git][java-team/airlift-slice][upstream] New upstream version 0.16

Thu Jan 16 15:13:23 GMT 2025


Andreas Tille pushed to branch upstream at Debian Java Maintainers / airlift-slice


Commits:
0b121495 by Andreas Tille at 2025-01-16T15:57:57+01:00
New upstream version 0.16
- - - - -


12 changed files:

- .travis.yml
- pom.xml
- + src/main/java/io/airlift/slice/InvalidCodePointException.java
- + src/main/java/io/airlift/slice/InvalidUtf8Exception.java
- src/main/java/io/airlift/slice/Slice.java
- + src/main/java/io/airlift/slice/SliceUtf8.java
- src/main/java/io/airlift/slice/Slices.java
- src/main/java/io/airlift/slice/UnsafeSliceFactory.java
- src/main/java/io/airlift/slice/XxHash64.java
- + src/test/java/io/airlift/slice/SliceUtf8Benchmark.java
- src/test/java/io/airlift/slice/TestSlice.java
- + src/test/java/io/airlift/slice/TestSliceUtf8.java


Changes:

=====================================
.travis.yml
=====================================
@@ -1 +1,4 @@
 language: java
+
+jdk:
+  - oraclejdk8


=====================================
pom.xml
=====================================
@@ -3,7 +3,7 @@
     <modelVersion>4.0.0</modelVersion>
 
     <artifactId>slice</artifactId>
-    <version>0.10</version>
+    <version>0.16</version>
     <packaging>jar</packaging>
 
     <name>slice</name>
@@ -13,7 +13,7 @@
     <parent>
         <groupId>io.airlift</groupId>
         <artifactId>airbase</artifactId>
-        <version>31</version>
+        <version>38</version>
     </parent>
 
     <inceptionYear>2012</inceptionYear>
@@ -21,6 +21,7 @@
     <properties>
         <air.check.skip-extended>true</air.check.skip-extended>
         <air.check.skip-license>false</air.check.skip-license>
+        <air.javadoc.lint>-missing</air.javadoc.lint>
         <dep.jmh.version>0.9.4</dep.jmh.version>
     </properties>
 
@@ -78,6 +79,6 @@
     </build>
 
   <scm>
-    <tag>0.10</tag>
+    <tag>0.16</tag>
   </scm>
 </project>


=====================================
src/main/java/io/airlift/slice/InvalidCodePointException.java
=====================================
@@ -0,0 +1,33 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.airlift.slice;
+
+import static java.lang.Integer.toHexString;
+
+public class InvalidCodePointException
+        extends IllegalArgumentException
+{
+    private final int codePoint;
+
+    public InvalidCodePointException(int codePoint)
+    {
+        super("Invalid code point 0x" + toHexString(codePoint).toUpperCase());
+        this.codePoint = codePoint;
+    }
+
+    public int getCodePoint()
+    {
+        return codePoint;
+    }
+}


=====================================
src/main/java/io/airlift/slice/InvalidUtf8Exception.java
=====================================
@@ -0,0 +1,23 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.airlift.slice;
+
+public class InvalidUtf8Exception
+        extends IllegalArgumentException
+{
+    public InvalidUtf8Exception(String message)
+    {
+        super(message);
+    }
+}


=====================================
src/main/java/io/airlift/slice/Slice.java
=====================================
@@ -35,6 +35,7 @@ import static io.airlift.slice.SizeOf.SIZE_OF_INT;
 import static io.airlift.slice.SizeOf.SIZE_OF_LONG;
 import static io.airlift.slice.SizeOf.SIZE_OF_SHORT;
 import static io.airlift.slice.StringDecoder.decodeString;
+import static java.lang.Math.min;
 import static java.lang.String.format;
 import static java.nio.charset.StandardCharsets.UTF_8;
 import static sun.misc.Unsafe.ARRAY_BOOLEAN_BASE_OFFSET;
@@ -74,7 +75,7 @@ public final class Slice
      * this slice; otherwise, address is the offset from the base object.
      * This base plus relative offset addressing is taken directly from
      * the Unsafe interface.
-     * <p/>
+     * <p>
      * Note: if base object is a byte array, this address ARRAY_BYTE_BASE_OFFSET,
      * since the byte array data starts AFTER the byte array object header.
      */
@@ -85,6 +86,11 @@ public final class Slice
      */
     private final int size;
 
+    /**
+     * Bytes retained by the slice
+     */
+    private final int retainedSize;
+
     /**
      * Reference is typically a ByteBuffer object, but can be any object this
      * slice must hold onto to assure that the underlying memory is not
@@ -102,6 +108,7 @@ public final class Slice
         this.base = null;
         this.address = 0;
         this.size = 0;
+        this.retainedSize = 0;
         this.reference = null;
     }
 
@@ -114,6 +121,7 @@ public final class Slice
         this.base = base;
         this.address = ARRAY_BYTE_BASE_OFFSET;
         this.size = base.length;
+        this.retainedSize = base.length;
         this.reference = null;
     }
 
@@ -128,6 +136,7 @@ public final class Slice
         this.base = base;
         this.address = ARRAY_BYTE_BASE_OFFSET + offset;
         this.size = length;
+        this.retainedSize = base.length;
         this.reference = null;
     }
 
@@ -142,6 +151,7 @@ public final class Slice
         this.base = base;
         this.address = ARRAY_BOOLEAN_BASE_OFFSET + offset;
         this.size = length * ARRAY_BOOLEAN_INDEX_SCALE;
+        this.retainedSize = base.length * ARRAY_BOOLEAN_INDEX_SCALE;
         this.reference = null;
     }
 
@@ -156,6 +166,7 @@ public final class Slice
         this.base = base;
         this.address = ARRAY_SHORT_BASE_OFFSET + offset;
         this.size = length * ARRAY_SHORT_INDEX_SCALE;
+        this.retainedSize = base.length * ARRAY_SHORT_INDEX_SCALE;
         this.reference = null;
     }
 
@@ -170,6 +181,7 @@ public final class Slice
         this.base = base;
         this.address = ARRAY_INT_BASE_OFFSET + offset;
         this.size = length * ARRAY_INT_INDEX_SCALE;
+        this.retainedSize = base.length * ARRAY_INT_INDEX_SCALE;
         this.reference = null;
     }
 
@@ -184,6 +196,7 @@ public final class Slice
         this.base = base;
         this.address = ARRAY_LONG_BASE_OFFSET + offset;
         this.size = length * ARRAY_LONG_INDEX_SCALE;
+        this.retainedSize = base.length * ARRAY_LONG_INDEX_SCALE;
         this.reference = null;
     }
 
@@ -198,6 +211,7 @@ public final class Slice
         this.base = base;
         this.address = ARRAY_FLOAT_BASE_OFFSET + offset;
         this.size = length * ARRAY_FLOAT_INDEX_SCALE;
+        this.retainedSize = base.length * ARRAY_FLOAT_INDEX_SCALE;
         this.reference = null;
     }
 
@@ -212,13 +226,14 @@ public final class Slice
         this.base = base;
         this.address = ARRAY_DOUBLE_BASE_OFFSET + offset;
         this.size = length * ARRAY_DOUBLE_INDEX_SCALE;
+        this.retainedSize = base.length * ARRAY_DOUBLE_INDEX_SCALE;
         this.reference = null;
     }
 
     /**
      * Creates a slice for directly accessing the base object.
      */
-    Slice(@Nullable Object base, long address, int size, @Nullable Object reference)
+    Slice(@Nullable Object base, long address, int size, int retainedSize, @Nullable Object reference)
     {
         if (address <= 0) {
             throw new IllegalArgumentException(format("Invalid address: %s", address));
@@ -232,6 +247,7 @@ public final class Slice
         this.base = base;
         this.address = address;
         this.size = size;
+        this.retainedSize = retainedSize;
     }
 
     /**
@@ -260,6 +276,14 @@ public final class Slice
         return size;
     }
 
+    /**
+     * Approximate number of bytes retained by this slice.
+     */
+    public int getRetainedSize()
+    {
+        return retainedSize;
+    }
+
     /**
      * Fill the slice with the specified value;
      */
@@ -313,6 +337,11 @@ public final class Slice
     public byte getByte(int index)
     {
         checkIndexLength(index, SIZE_OF_BYTE);
+        return getByteUnchecked(index);
+    }
+
+    byte getByteUnchecked(int index)
+    {
         return unsafe.getByte(base, address + index);
     }
 
@@ -338,6 +367,11 @@ public final class Slice
     public short getShort(int index)
     {
         checkIndexLength(index, SIZE_OF_SHORT);
+        return getShortUnchecked(index);
+    }
+
+    short getShortUnchecked(int index)
+    {
         return unsafe.getShort(base, address + index);
     }
 
@@ -351,6 +385,11 @@ public final class Slice
     public int getInt(int index)
     {
         checkIndexLength(index, SIZE_OF_INT);
+        return getIntUnchecked(index);
+    }
+
+    public int getIntUnchecked(int index)
+    {
         return unsafe.getInt(base, address + index);
     }
 
@@ -364,6 +403,11 @@ public final class Slice
     public long getLong(int index)
     {
         checkIndexLength(index, SIZE_OF_LONG);
+        return getLongUnchecked(index);
+    }
+
+    long getLongUnchecked(int index)
+    {
         return unsafe.getLong(base, address + index);
     }
 
@@ -494,9 +538,14 @@ public final class Slice
     {
         checkIndexLength(index, length);
 
+        if (base instanceof byte[]) {
+            out.write((byte[]) base, (int) ((address - ARRAY_BYTE_BASE_OFFSET) + index), length);
+            return;
+        }
+
         byte[] buffer = new byte[4096];
         while (length > 0) {
-            int size = Math.min(buffer.length, length);
+            int size = min(buffer.length, length);
             getBytes(index, buffer, 0, size);
             out.write(buffer, 0, size);
             length -= size;
@@ -514,6 +563,11 @@ public final class Slice
     public void setByte(int index, int value)
     {
         checkIndexLength(index, SIZE_OF_BYTE);
+        setByteUnchecked(index, value);
+    }
+
+    void setByteUnchecked(int index, int value)
+    {
         unsafe.putByte(base, address + index, (byte) (value & 0xFF));
     }
 
@@ -528,6 +582,11 @@ public final class Slice
     public void setShort(int index, int value)
     {
         checkIndexLength(index, SIZE_OF_SHORT);
+        setShortUnchecked(index, value);
+    }
+
+    void setShortUnchecked(int index, int value)
+    {
         unsafe.putShort(base, address + index, (short) (value & 0xFFFF));
     }
 
@@ -541,6 +600,11 @@ public final class Slice
     public void setInt(int index, int value)
     {
         checkIndexLength(index, SIZE_OF_INT);
+        setIntUnchecked(index, value);
+    }
+
+    void setIntUnchecked(int index, int value)
+    {
         unsafe.putInt(base, address + index, value);
     }
 
@@ -658,7 +722,7 @@ public final class Slice
         byte[] bytes = new byte[4096];
 
         while (length > 0) {
-            int bytesRead = in.read(bytes, 0, Math.min(bytes.length, length));
+            int bytesRead = in.read(bytes, 0, min(bytes.length, length));
             if (bytesRead < 0) {
                 throw new IndexOutOfBoundsException("End of stream");
             }
@@ -681,20 +745,119 @@ public final class Slice
         if (length == 0) {
             return Slices.EMPTY_SLICE;
         }
-        return new Slice(base, address + index, length, reference);
+        return new Slice(base, address + index, length, retainedSize, reference);
     }
 
     public int indexOfByte(int b)
     {
         b = b & 0xFF;
         for (int i = 0; i < size; i++) {
-            if (unsafe.getByte(base, address + i) == b) {
+            if (getByteUnchecked(i) == b) {
                 return i;
             }
         }
         return -1;
     }
 
+    /**
+     * Returns the index of the first occurrence of the pattern within this slice.
+     * If the pattern is not found, -1 is returned. If the pattern is empty, zero is
+     * returned.
+     */
+    public int indexOf(Slice slice)
+    {
+        return indexOf(slice, 0);
+    }
+
+    /**
+     * Returns the index of the first occurrence of the pattern within this slice.
+     * If the pattern is not found, -1 is returned. If the pattern is empty, the offset
+     * is returned.
+     */
+    public int indexOf(Slice pattern, int offset)
+    {
+        if (size == 0 || offset >= size) {
+            return -1;
+        }
+
+        if (pattern.length() == 0) {
+            return offset;
+        }
+
+        // Do we have enough characters
+        if (pattern.length() < SIZE_OF_INT || size < SIZE_OF_LONG) {
+            return indexOfBruteForce(pattern, offset);
+        }
+
+        // Use the first four bytes for a faster search. We do not use eight bytes (a long)
+        // because we want more strings to benefit from the fast search.
+        int head = pattern.getIntUnchecked(0);
+
+        // Take the first byte of head for faster skipping
+        int firstByteMask = head & 0xff;
+        firstByteMask |= firstByteMask << 8;
+        firstByteMask |= firstByteMask << 16;
+
+        int lastValidIndex = size - pattern.length();
+        int index = offset;
+        while (index <= lastValidIndex) {
+            // Read four bytes in sequence
+            int value = getIntUnchecked(index);
+
+            // Compare all bytes of value with first byte of search data
+            // see https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
+            int valueXor = value ^ firstByteMask;
+            int hasZeroBytes = (valueXor - 0x01010101) & ~valueXor & 0x80808080;
+
+            // If valueXor does not have any zero byte then there is no match and we can advance
+            if (hasZeroBytes == 0) {
+                index += SIZE_OF_INT;
+                continue;
+            }
+
+            // Try fast match of head and the rest
+            if (value == head && equalsUnchecked(index, pattern, 0, pattern.length())) {
+                return index;
+            }
+
+            index++;
+        }
+
+        return -1;
+    }
+
+    int indexOfBruteForce(Slice pattern, int offset)
+    {
+        if (size == 0 || offset >= size) {
+            return -1;
+        }
+
+        if (pattern.length() == 0) {
+            return offset;
+        }
+
+        byte firstByte = pattern.getByteUnchecked(0);
+        int lastValidIndex = size - pattern.length();
+        int index = offset;
+        while (true) {
+            // seek to first byte match
+            while (index < lastValidIndex && getByteUnchecked(index) != firstByte) {
+                index++;
+            }
+            if (index > lastValidIndex) {
+                break;
+            }
+
+            if (equalsUnchecked(index, pattern, 0, pattern.length())) {
+                return index;
+            }
+
+            index++;
+        }
+
+        return -1;
+    }
+
     /**
      * Compares the content of the specified buffer to the content of this
      * buffer.  This comparison is performed byte by byte using an unsigned
@@ -724,11 +887,11 @@ public final class Slice
         checkIndexLength(offset, length);
         that.checkIndexLength(otherOffset, otherLength);
 
-        int compareLength = Math.min(length, otherLength);
+        int compareLength = min(length, otherLength);
         while (compareLength >= SIZE_OF_LONG) {
-            long thisLong = unsafe.getLong(base, address + offset);
+            long thisLong = getLongUnchecked(offset);
             thisLong = Long.reverseBytes(thisLong);
-            long thatLong = unsafe.getLong(that.base, that.address + otherOffset);
+            long thatLong = that.getLongUnchecked(otherOffset);
             thatLong = Long.reverseBytes(thatLong);
 
             int v = compareUnsignedLongs(thisLong, thatLong);
@@ -742,8 +905,8 @@ public final class Slice
         }
 
         while (compareLength > 0) {
-            byte thisByte = unsafe.getByte(base, address + offset);
-            byte thatByte = unsafe.getByte(that.base, that.address + otherOffset);
+            byte thisByte = getByteUnchecked(offset);
+            byte thatByte = that.getByteUnchecked(otherOffset);
 
             int v = compareUnsignedBytes(thisByte, thatByte);
             if (v != 0) {
@@ -779,8 +942,8 @@ public final class Slice
         int offset = 0;
         int length = size;
         while (length >= SIZE_OF_LONG) {
-            long thisLong = unsafe.getLong(base, address + offset);
-            long thatLong = unsafe.getLong(that.base, that.address + offset);
+            long thisLong = getLongUnchecked(offset);
+            long thatLong = that.getLongUnchecked(offset);
 
             if (thisLong != thatLong) {
                 return false;
@@ -791,8 +954,8 @@ public final class Slice
         }
 
         while (length > 0) {
-            byte thisByte = unsafe.getByte(base, address + offset);
-            byte thatByte = unsafe.getByte(that.base, that.address + offset);
+            byte thisByte = getByteUnchecked(offset);
+            byte thatByte = that.getByteUnchecked(offset);
             if (thisByte != thatByte) {
                 return false;
             }
@@ -838,16 +1001,21 @@ public final class Slice
             return false;
         }
 
+        checkIndexLength(offset, length);
+        that.checkIndexLength(otherOffset, otherLength);
+
+        return equalsUnchecked(offset, that, otherOffset, length);
+    }
+
+    boolean equalsUnchecked(int offset, Slice that, int otherOffset, int length)
+    {
         if ((this == that) && (offset == otherOffset)) {
             return true;
         }
 
-        checkIndexLength(offset, length);
-        that.checkIndexLength(otherOffset, otherLength);
-
         while (length >= SIZE_OF_LONG) {
-            long thisLong = unsafe.getLong(base, address + offset);
-            long thatLong = unsafe.getLong(that.base, that.address + otherOffset);
+            long thisLong = getLongUnchecked(offset);
+            long thatLong = that.getLongUnchecked(otherOffset);
 
             if (thisLong != thatLong) {
                 return false;
@@ -859,8 +1027,8 @@ public final class Slice
         }
 
         while (length > 0) {
-            byte thisByte = unsafe.getByte(base, address + offset);
-            byte thatByte = unsafe.getByte(that.base, that.address + otherOffset);
+            byte thisByte = getByteUnchecked(offset);
+            byte thatByte = that.getByteUnchecked(otherOffset);
             if (thisByte != thatByte) {
                 return false;
             }
@@ -932,7 +1100,7 @@ public final class Slice
 
         char[] chars = new char[length];
         for (int pos = index; pos < length; pos++) {
-            chars[pos] = (char) (unsafe.getByte(base, address + pos) & 0x7F);
+            chars[pos] = (char) (getByteUnchecked(pos) & 0x7F);
         }
         return new String(chars);
     }
@@ -1029,13 +1197,13 @@ public final class Slice
     private static long fillLong(byte value)
     {
         return (value & 0xFFL) << 56
-            | (value & 0xFFL) << 48
-            | (value & 0xFFL) << 40
-            | (value & 0xFFL) << 32
-            | (value & 0xFFL) << 24
-            | (value & 0xFFL) << 16
-            | (value & 0xFFL) << 8
-            | (value & 0xFFL);
+                | (value & 0xFFL) << 48
+                | (value & 0xFFL) << 40
+                | (value & 0xFFL) << 32
+                | (value & 0xFFL) << 24
+                | (value & 0xFFL) << 16
+                | (value & 0xFFL) << 8
+                | (value & 0xFFL);
     }
 
     private static int compareUnsignedBytes(byte thisByte, byte thatByte)


=====================================
src/main/java/io/airlift/slice/SliceUtf8.java
=====================================
@@ -0,0 +1,942 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.airlift.slice;
+
+import java.util.OptionalInt;
+
+import static io.airlift.slice.Preconditions.checkArgument;
+import static io.airlift.slice.Preconditions.checkPositionIndex;
+import static io.airlift.slice.Preconditions.checkPositionIndexes;
+import static java.lang.Character.MAX_CODE_POINT;
+import static java.lang.Character.MAX_SURROGATE;
+import static java.lang.Character.MIN_SURROGATE;
+import static java.lang.Integer.toHexString;
+
+/**
+ * Utility methods for UTF-8 encoded slices.
+ */
+public final class SliceUtf8
+{
+    private SliceUtf8() {}
+
+    private static final int REPLACEMENT_CODE_POINT = 0xFFFD;
+
+    private static final int TOP_MASK32 = 0x8080_8080;
+    private static final long TOP_MASK64 = 0x8080_8080_8080_8080L;
+
+    private static final int[] LOWER_CODE_POINTS;
+    private static final int[] UPPER_CODE_POINTS;
+    private static final boolean[] WHITESPACE_CODE_POINTS;
+
+    static {
+        LOWER_CODE_POINTS = new int[MAX_CODE_POINT + 1];
+        UPPER_CODE_POINTS = new int[MAX_CODE_POINT + 1];
+        WHITESPACE_CODE_POINTS = new boolean[MAX_CODE_POINT + 1];
+        for (int codePoint = 0; codePoint <= MAX_CODE_POINT; codePoint++) {
+            int type = Character.getType(codePoint);
+            if (type != Character.SURROGATE) {
+                LOWER_CODE_POINTS[codePoint] = Character.toLowerCase(codePoint);
+                UPPER_CODE_POINTS[codePoint] = Character.toUpperCase(codePoint);
+                WHITESPACE_CODE_POINTS[codePoint] = Character.isWhitespace(codePoint);
+            }
+            else {
+                LOWER_CODE_POINTS[codePoint] = REPLACEMENT_CODE_POINT;
+                UPPER_CODE_POINTS[codePoint] = REPLACEMENT_CODE_POINT;
+                WHITESPACE_CODE_POINTS[codePoint] = false;
+            }
+        }
+    }
+
+    /**
+     * Does the slice contain only 7-bit ASCII characters.
+     */
+    public static boolean isAscii(Slice utf8)
+    {
+        int length = utf8.length();
+        int offset = 0;
+
+        // Length rounded to 8 bytes
+        int length8 = length & 0x7FFF_FFF8;
+        for (; offset < length8; offset += 8) {
+            if ((utf8.getLongUnchecked(offset) & TOP_MASK64) != 0) {
+                return false;
+            }
+        }
+        // Enough bytes left for 32 bits?
+        if (offset + 4 < length) {
+            if ((utf8.getIntUnchecked(offset) & TOP_MASK32) != 0) {
+                return false;
+            }
+
+            offset += 4;
+        }
+        // Do the rest one by one
+        for (; offset < length; offset++) {
+            if ((utf8.getByteUnchecked(offset) & 0x80) != 0) {
+                return false;
+            }
+        }
+
+        return true;
+    }
+
+    /**
+     * Counts the code points within UTF-8 encoded slice.
+     * <p>
+     * Note: This method does not explicitly check for valid UTF-8, and may
+     * return incorrect results or throw an exception for invalid UTF-8.
+     */
+    public static int countCodePoints(Slice utf8)
+    {
+        return countCodePoints(utf8, 0, utf8.length());
+    }
+
+    /**
+     * Counts the code points within UTF-8 encoded slice up to {@code length}.
+     * <p>
+     * Note: This method does not explicitly check for valid UTF-8, and may
+     * return incorrect results or throw an exception for invalid UTF-8.
+     */
+    public static int countCodePoints(Slice utf8, int offset, int length)
+    {
+        checkPositionIndexes(offset, offset + length, utf8.length());
+
+        // Quick exit if empty string
+        if (length == 0) {
+            return 0;
+        }
+
+        int continuationBytesCount = 0;
+        // Length rounded to 8 bytes
+        int length8 = length & 0x7FFF_FFF8;
+        for (; offset < length8; offset += 8) {
+            // Count bytes which are NOT the start of a code point
+            continuationBytesCount += countContinuationBytes(utf8.getLongUnchecked(offset));
+        }
+        // Enough bytes left for 32 bits?
+        if (offset + 4 < length) {
+            // Count bytes which are NOT the start of a code point
+            continuationBytesCount += countContinuationBytes(utf8.getIntUnchecked(offset));
+
+            offset += 4;
+        }
+        // Do the rest one by one
+        for (; offset < length; offset++) {
+            // Count bytes which are NOT the start of a code point
+            continuationBytesCount += countContinuationBytes(utf8.getByteUnchecked(offset));
+        }
+
+        assert continuationBytesCount <= length;
+        return length - continuationBytesCount;
+    }
+
+    /**
+     * Gets the substring starting at {@code codePointStart} and extending for
+     * {@code codePointLength} code points.
+     * <p>
+     * Note: This method does not explicitly check for valid UTF-8, and may
+     * return incorrect results or throw an exception for invalid UTF-8.
+     */
+    public static Slice substring(Slice utf8, int codePointStart, int codePointLength)
+    {
+        checkArgument(codePointStart >= 0, "codePointStart is negative");
+        checkArgument(codePointLength >= 0, "codePointLength is negative");
+
+        int indexStart = offsetOfCodePoint(utf8, codePointStart);
+        if (indexStart < 0) {
+            throw new IllegalArgumentException("UTF-8 does not contain " + codePointStart + " code points");
+        }
+        if (codePointLength == 0) {
+            return Slices.EMPTY_SLICE;
+        }
+        int indexEnd = offsetOfCodePoint(utf8, indexStart, codePointLength - 1);
+        if (indexEnd < 0) {
+            throw new IllegalArgumentException("UTF-8 does not contain " + (codePointStart + codePointLength) + " code points");
+        }
+        indexEnd += lengthOfCodePoint(utf8, indexEnd);
+        if (indexEnd > utf8.length()) {
+            throw new InvalidUtf8Exception("UTF-8 is not well formed");
+        }
+        return utf8.slice(indexStart, indexEnd - indexStart);
+    }
+
+    /**
+     * Reverses the slice code point by code point.
+     * <p>
+     * Note: Invalid UTF-8 sequences are copied directly to the output.
+     */
+    public static Slice reverse(Slice utf8)
+    {
+        int length = utf8.length();
+        Slice reverse = Slices.allocate(length);
+
+        int forwardPosition = 0;
+        int reversePosition = length;
+        while (forwardPosition < length) {
+            int codePointLength = lengthOfCodePointSafe(utf8, forwardPosition);
+
+            // backup the reverse pointer
+            reversePosition -= codePointLength;
+            if (reversePosition < 0) {
+                // this should not happen
+                throw new InvalidUtf8Exception("UTF-8 is not well formed");
+            }
+            // copy the character
+            copyUtf8SequenceUnsafe(utf8, forwardPosition, reverse, reversePosition, codePointLength);
+
+            forwardPosition += codePointLength;
+        }
+        return reverse;
+    }
+
+    /**
+     * Converts slice to upper case code point by code point.  This method does
+     * not perform locale-sensitive, context-sensitive, or one-to-many
+     * mappings required for some languages.  Specifically, this will return
+     * incorrect results for Lithuanian, Turkish, and Azeri.
+     * <p>
+     * Note: Invalid UTF-8 sequences are copied directly to the output.
+     */
+    public static Slice toUpperCase(Slice utf8)
+    {
+        return translateCodePoints(utf8, UPPER_CODE_POINTS);
+    }
+
+    /**
+     * Converts slice to lower case code point by code point.  This method does
+     * not perform locale-sensitive, context-sensitive, or one-to-many
+     * mappings required for some languages.  Specifically, this will return
+     * incorrect results for Lithuanian, Turkish, and Azeri.
+     * <p>
+     * Note: Invalid UTF-8 sequences are copied directly to the output.
+     */
+    public static Slice toLowerCase(Slice utf8)
+    {
+        return translateCodePoints(utf8, LOWER_CODE_POINTS);
+    }
+
+    private static Slice translateCodePoints(Slice utf8, int[] codePointTranslationMap)
+    {
+        int length = utf8.length();
+        Slice newUtf8 = Slices.allocate(length);
+
+        int position = 0;
+        int upperPosition = 0;
+        while (position < length) {
+            int codePoint = tryGetCodePointAt(utf8, position);
+            if (codePoint >= 0) {
+                int upperCodePoint = codePointTranslationMap[codePoint];
+
+                // grow slice if necessary
+                int nextUpperPosition = upperPosition + lengthOfCodePoint(upperCodePoint);
+                if (nextUpperPosition > length) {
+                    newUtf8 = Slices.ensureSize(newUtf8, nextUpperPosition);
+                }
+
+                // write new byte
+                setCodePointAt(upperCodePoint, newUtf8, upperPosition);
+
+                position += lengthOfCodePoint(codePoint);
+                upperPosition = nextUpperPosition;
+            }
+            else {
+                int skipLength = -codePoint;
+
+                // grow slice if necessary
+                int nextUpperPosition = upperPosition + skipLength;
+                if (nextUpperPosition > length) {
+                    newUtf8 = Slices.ensureSize(newUtf8, nextUpperPosition);
+                }
+
+                copyUtf8SequenceUnsafe(utf8, position, newUtf8, upperPosition, skipLength);
+                position += skipLength;
+                upperPosition = nextUpperPosition;
+            }
+        }
+        return newUtf8.slice(0, upperPosition);
+    }
+
+    /**
+     * Copies a UTF-8 sequence of {@code length} bytes (1 to 6) from {@code source}
+     * at {@code sourcePosition} to {@code destination} at {@code destinationPosition}.
+     * <p>
+     * Note: only the {@code *Unchecked} slice accessors are used, so the caller is
+     * presumably responsible for making sure both ranges are within bounds.
+     *
+     * @throws IllegalStateException if {@code length} is not between 1 and 6
+     */
+    private static void copyUtf8SequenceUnsafe(Slice source, int sourcePosition, Slice destination, int destinationPosition, int length)
+    {
+        if (length < 1 || length > 6) {
+            throw new IllegalStateException("Invalid code point length " + length);
+        }
+
+        // copy the widest leading chunk first: 4 bytes, 2 bytes, or nothing
+        int copied = 0;
+        if (length >= 4) {
+            destination.setIntUnchecked(destinationPosition, source.getIntUnchecked(sourcePosition));
+            copied = 4;
+        }
+        else if (length >= 2) {
+            destination.setShortUnchecked(destinationPosition, source.getShortUnchecked(sourcePosition));
+            copied = 2;
+        }
+
+        // then whatever is left over: 2 bytes, 1 byte, or nothing
+        int remaining = length - copied;
+        if (remaining == 2) {
+            destination.setShortUnchecked(destinationPosition + copied, source.getShortUnchecked(sourcePosition + copied));
+        }
+        else if (remaining == 1) {
+            destination.setByteUnchecked(destinationPosition + copied, source.getByteUnchecked(sourcePosition + copied));
+        }
+    }
+
+    /**
+     * Removes all white space characters from the left side of the string.
+     * <p>
+     * Note: Invalid UTF-8 sequences are not trimmed.
+     *
+     * @return the slice of {@code utf8} starting at the first byte that is not
+     * part of a leading white space code point
+     */
+    public static Slice leftTrim(Slice utf8)
+    {
+        int start = firstNonWhitespacePosition(utf8);
+        int remaining = utf8.length() - start;
+        return utf8.slice(start, remaining);
+    }
+
+    /**
+     * Returns the byte offset of the first code point that is not white space,
+     * or the slice length if every code point is white space.  Scanning stops
+     * at the first invalid UTF-8 sequence.
+     */
+    private static int firstNonWhitespacePosition(Slice utf8)
+    {
+        int end = utf8.length();
+
+        int offset = 0;
+        while (offset < end) {
+            int decoded = tryGetCodePointAt(utf8, offset);
+            // a negative result marks an invalid sequence, which is never trimmed
+            if (decoded < 0 || !WHITESPACE_CODE_POINTS[decoded]) {
+                return offset;
+            }
+            offset += lengthOfCodePoint(decoded);
+        }
+        return offset;
+    }
+
+    /**
+     * Removes all white space characters from the right side of the string.
+     * <p>
+     * Note: Invalid UTF-8 sequences are not trimmed.
+     *
+     * @return the slice of {@code utf8} ending after the last code point that
+     * is not trailing white space
+     */
+    public static Slice rightTrim(Slice utf8)
+    {
+        return utf8.slice(0, lastNonWhitespacePosition(utf8, 0));
+    }
+
+    /**
+     * Returns the byte offset just past the last code point that is not white
+     * space, never going below {@code minPosition}.
+     * <p>
+     * The code point ending at the current position is decoded by looking back
+     * up to four bytes for a non-continuation byte.  The decoded code point is
+     * only accepted when its encoded length equals the look-back distance;
+     * otherwise the bytes before the position do not form exactly one code
+     * point (for example a stray continuation byte following a space), and
+     * scanning stops so that invalid UTF-8 sequences are never trimmed or split.
+     *
+     * @param utf8 the slice to scan
+     * @param minPosition the lowest offset the result may take
+     */
+    private static int lastNonWhitespacePosition(Slice utf8, int minPosition)
+    {
+        int position = utf8.length();
+        while (minPosition < position) {
+            // decode the code point before position if possible
+            int codePoint;
+            int codePointLength;
+            byte lastByte = utf8.getByte(position - 1);
+            if (!isContinuationByte(lastByte)) {
+                codePoint = lastByte & 0xFF;
+                codePointLength = 1;
+            }
+            else if (minPosition <= position - 2 && !isContinuationByte(utf8.getByte(position - 2))) {
+                codePoint = tryGetCodePointAt(utf8, position - 2);
+                codePointLength = 2;
+            }
+            else if (minPosition <= position - 3 && !isContinuationByte(utf8.getByte(position - 3))) {
+                codePoint = tryGetCodePointAt(utf8, position - 3);
+                codePointLength = 3;
+            }
+            else if (minPosition <= position - 4 && !isContinuationByte(utf8.getByte(position - 4))) {
+                codePoint = tryGetCodePointAt(utf8, position - 4);
+                codePointLength = 4;
+            }
+            else {
+                break;
+            }
+
+            // stop on an invalid sequence, or when the decoded code point does not
+            // end exactly at position (trailing stray bytes or a lone start byte)
+            if (codePoint < 0 || codePointLength != lengthOfCodePoint(codePoint)) {
+                break;
+            }
+            if (!WHITESPACE_CODE_POINTS[codePoint]) {
+                break;
+            }
+            position -= codePointLength;
+        }
+        return position;
+    }
+
+    /**
+     * Removes all white space characters from both the left and the right side
+     * of the string.
+     * <p>
+     * Note: Invalid UTF-8 sequences are not trimmed.
+     */
+    public static Slice trim(Slice utf8)
+    {
+        int firstKept = firstNonWhitespacePosition(utf8);
+        // the backward scan is bounded by the forward result so the two never cross
+        int pastLastKept = lastNonWhitespacePosition(utf8, firstKept);
+        int keptLength = pastLastKept - firstKept;
+        return utf8.slice(firstKept, keptLength);
+    }
+
+    /**
+     * Replaces every invalid UTF-8 sequence in {@code slice} with
+     * {@code REPLACEMENT_CODE_POINT}.
+     */
+    public static Slice fixInvalidUtf8(Slice slice)
+    {
+        OptionalInt replacement = OptionalInt.of(REPLACEMENT_CODE_POINT);
+        return fixInvalidUtf8(slice, replacement);
+    }
+
+    /**
+     * Returns a copy of {@code slice} in which every invalid UTF-8 sequence is
+     * replaced with {@code replacementCodePoint}, or removed when no replacement
+     * is present.  A slice that passes {@code isAscii} is returned as is.
+     *
+     * @param slice the possibly malformed UTF-8 data
+     * @param replacementCodePoint the code point written in place of each invalid
+     * sequence; empty to drop invalid sequences
+     */
+    public static Slice fixInvalidUtf8(Slice slice, OptionalInt replacementCodePoint)
+    {
+        if (isAscii(slice)) {
+            return slice;
+        }
+
+        boolean hasReplacement = replacementCodePoint.isPresent();
+        int replacement = hasReplacement ? replacementCodePoint.getAsInt() : -1;
+        int replacementLength = hasReplacement ? lengthOfCodePoint(replacement) : 0;
+
+        int inputLength = slice.length();
+        Slice output = Slices.allocate(inputLength);
+
+        int readOffset = 0;
+        int writeOffset = 0;
+        while (readOffset < inputLength) {
+            int decoded = tryGetCodePointAt(slice, readOffset);
+
+            int toWrite;
+            int toWriteLength;
+            if (decoded < 0) {
+                // a negative result is minus the number of invalid bytes to skip
+                readOffset -= decoded;
+                if (replacement < 0) {
+                    continue;
+                }
+                toWrite = replacement;
+                toWriteLength = replacementLength;
+            }
+            else {
+                toWrite = decoded;
+                toWriteLength = lengthOfCodePoint(decoded);
+                readOffset += toWriteLength;
+            }
+
+            // the replacement may be longer than the bytes it replaces
+            output = Slices.ensureSize(output, writeOffset + toWriteLength);
+            writeOffset += setCodePointAt(toWrite, output, writeOffset);
+        }
+        return output.slice(0, writeOffset);
+    }
+
+    /**
+     * Tries to get the UTF-8 encoded code point at the {@code position}.  A
+     * non-negative return value means the UTF-8 sequence at the position is valid,
+     * and the result is the code point.  A negative return value means the UTF-8
+     * sequence at the position is invalid, and the length of the invalid sequence
+     * is the absolute value of the result.
+     * <p>
+     * A sequence is invalid when it starts with a continuation byte or an illegal
+     * start byte, is truncated, is followed by a non-continuation byte, encodes a
+     * surrogate, encodes a value above U+10FFFF, is longer than four bytes, or is
+     * an overlong encoding (a code point encoded with more bytes than necessary).
+     *
+     * @param utf8 the slice to read from
+     * @param position the byte offset of the first byte of the sequence
+     * @return the code point or negative the number of bytes in the invalid UTF-8 sequence.
+     */
+    public static int tryGetCodePointAt(Slice utf8, int position)
+    {
+        //
+        // Process first byte
+        byte firstByte = utf8.getByte(position);
+
+        int length = lengthOfCodePointFromStartByteSafe(firstByte);
+        if (length < 0) {
+            return length;
+        }
+
+        if (length == 1) {
+            // normal ASCII
+            // 0xxx_xxxx
+            return firstByte;
+        }
+
+        //
+        // Process second byte
+        if (position + 1 >= utf8.length()) {
+            return -1;
+        }
+
+        byte secondByte = utf8.getByteUnchecked(position + 1);
+        if (!isContinuationByte(secondByte)) {
+            return -1;
+        }
+
+        if (length == 2) {
+            // 110x_xxxx 10xx_xxxx
+            int codePoint = ((firstByte & 0b0001_1111) << 6) |
+                    (secondByte & 0b0011_1111);
+            // values below 0x80 fit in one byte, so this is an overlong encoding
+            return codePoint < 0x80 ? -2 : codePoint;
+        }
+
+        //
+        // Process third byte
+        if (position + 2 >= utf8.length()) {
+            return -2;
+        }
+
+        byte thirdByte = utf8.getByteUnchecked(position + 2);
+        if (!isContinuationByte(thirdByte)) {
+            return -2;
+        }
+
+        if (length == 3) {
+            // 1110_xxxx 10xx_xxxx 10xx_xxxx
+            int codePoint = ((firstByte & 0b0000_1111) << 12) |
+                    ((secondByte & 0b0011_1111) << 6) |
+                    (thirdByte & 0b0011_1111);
+
+            // surrogates are invalid
+            if (MIN_SURROGATE <= codePoint && codePoint <= MAX_SURROGATE) {
+                return -3;
+            }
+            // values below 0x800 fit in two bytes, so this is an overlong encoding
+            return codePoint < 0x800 ? -3 : codePoint;
+        }
+
+        //
+        // Process fourth byte
+        if (position + 3 >= utf8.length()) {
+            return -3;
+        }
+
+        byte fourthByte = utf8.getByteUnchecked(position + 3);
+        if (!isContinuationByte(fourthByte)) {
+            return -3;
+        }
+
+        if (length == 4) {
+            // 1111_0xxx 10xx_xxxx 10xx_xxxx 10xx_xxxx
+            int codePoint = ((firstByte & 0b0000_0111) << 18) |
+                    ((secondByte & 0b0011_1111) << 12) |
+                    ((thirdByte & 0b0011_1111) << 6) |
+                    (fourthByte & 0b0011_1111);
+            // 4 byte code points have a limited valid range: below 0x1_0000 is an
+            // overlong encoding, 0x11_0000 and above is outside of Unicode
+            if (0x1_0000 <= codePoint && codePoint < 0x11_0000) {
+                return codePoint;
+            }
+            return -4;
+        }
+
+        //
+        // Process fifth byte
+        if (position + 4 >= utf8.length()) {
+            return -4;
+        }
+
+        byte fifthByte = utf8.getByteUnchecked(position + 4);
+        if (!isContinuationByte(fifthByte)) {
+            return -4;
+        }
+
+        if (length == 5) {
+            // Per RFC3629, UTF-8 is limited to 4 bytes, so more bytes are illegal
+            return -5;
+        }
+
+        //
+        // Process sixth byte
+        if (position + 5 >= utf8.length()) {
+            return -5;
+        }
+
+        byte sixthByte = utf8.getByteUnchecked(position + 5);
+        if (!isContinuationByte(sixthByte)) {
+            return -5;
+        }
+
+        if (length == 6) {
+            // Per RFC3629, UTF-8 is limited to 4 bytes, so more bytes are illegal
+            return -6;
+        }
+
+        // for longer sequence, which can't happen
+        return -1;
+    }
+
+    /**
+     * Gets the sequence length announced by a UTF-8 start byte without throwing.
+     *
+     * @return 1 to 6 for a start byte of a sequence of that many bytes, or
+     * {@code -1} for a continuation byte ({@code 10xx_xxxx}) and for the bytes
+     * {@code 0xFE} and {@code 0xFF}
+     */
+    static int lengthOfCodePointFromStartByteSafe(byte startByte)
+    {
+        // move the byte into the top 8 bits and invert it, so that the number of
+        // leading zeros equals the number of leading one bits of the byte
+        int leadingOnes = Integer.numberOfLeadingZeros(~(startByte << 24));
+        switch (leadingOnes) {
+            case 0:
+                // normal ASCII
+                // 0xxx_xxxx
+                return 1;
+            case 2:
+                // 110x_xxxx 10xx_xxxx
+                return 2;
+            case 3:
+                // 1110_xxxx 10xx_xxxx 10xx_xxxx
+                return 3;
+            case 4:
+                // 1111_0xxx 10xx_xxxx 10xx_xxxx 10xx_xxxx
+                return 4;
+            case 5:
+                // 1111_10xx 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx
+                return 5;
+            case 6:
+                // 1111_110x 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx 10xx_xxxx
+                return 6;
+            default:
+                // 10xx_xxxx (continuation byte), 1111_1110 and 1111_1111 are illegal
+                return -1;
+        }
+    }
+
+    /**
+     * Finds the index of the first byte of the code point at position
+     * {@code codePointCount}, counted from the start of the slice, or
+     * {@code -1} if the position is not within the slice.
+     * <p>
+     * Note: This method does not explicitly check for valid UTF-8, and may
+     * return incorrect results or throw an exception for invalid UTF-8.
+     */
+    public static int offsetOfCodePoint(Slice utf8, int codePointCount)
+    {
+        int startOfSlice = 0;
+        return offsetOfCodePoint(utf8, startOfSlice, codePointCount);
+    }
+
+    /**
+     * Starting from {@code position} bytes in {@code utf8}, finds the
+     * index of the first byte of the code point {@code codePointCount}
+     * in the slice.  If the slice does not contain
+     * {@code codePointCount} code points after {@code position}, {@code -1}
+     * is returned.
+     * <p>
+     * The search keeps a candidate answer, {@code correctIndex}, which starts at
+     * {@code position + codePointCount} (the answer if every code point were a
+     * single byte) and is pushed forward by one for every continuation byte
+     * ({@code 10xx_xxxx}) that is scanned.
+     * <p>
+     * Note: This method does not explicitly check for valid UTF-8, and may
+     * return incorrect results or throw an exception for invalid UTF-8.
+     *
+     * @param utf8 the slice to search
+     * @param position the byte offset to start counting from; validated with
+     * {@code checkPositionIndex} against the slice length
+     * @param codePointCount the number of code points to skip; must not be negative
+     * @return the byte offset of the requested code point, or {@code -1}
+     */
+    public static int offsetOfCodePoint(Slice utf8, int position, int codePointCount)
+    {
+        checkPositionIndex(position, utf8.length());
+        checkArgument(codePointCount >= 0, "codePointPosition is negative");
+
+        // Quick exit if we are sure that the position is after the end
+        // (every code point takes at least one byte)
+        if (utf8.length() - position <= codePointCount) {
+            return -1;
+        }
+        if (codePointCount == 0) {
+            return position;
+        }
+
+        // Candidate answer, assuming only single byte code points so far
+        int correctIndex = codePointCount + position;
+        // Length rounded to 8 bytes
+        int length8 = utf8.length() & 0x7FFF_FFF8;
+        // While we have enough bytes left and we need at least 8 characters process 8 bytes at once
+        while (position < length8 && correctIndex >= position + 8) {
+            // Count bytes which are NOT the start of a code point
+            correctIndex += countContinuationBytes(utf8.getLongUnchecked(position));
+
+            position += 8;
+        }
+        // Length rounded to 4 bytes
+        int length4 = utf8.length() & 0x7FFF_FFFC;
+        // While we have enough bytes left and we need at least 4 characters process 4 bytes at once
+        while (position < length4 && correctIndex >= position + 4) {
+            // Count bytes which are NOT the start of a code point
+            correctIndex += countContinuationBytes(utf8.getIntUnchecked(position));
+
+            position += 4;
+        }
+        // Do the rest one by one, always check the last byte to find the end of the code point
+        while (position < utf8.length()) {
+            // Count bytes which are NOT the start of a code point
+            correctIndex += countContinuationBytes(utf8.getByteUnchecked(position));
+            // The scan has caught up with the candidate and the byte here is not
+            // a continuation byte, so the candidate is the start of a code point
+            if (position == correctIndex) {
+                break;
+            }
+
+            position++;
+        }
+
+        // Only a candidate that was reached and lies inside the slice is a valid answer
+        if (position == correctIndex && correctIndex < utf8.length()) {
+            return correctIndex;
+        }
+        return -1;
+    }
+
+    /**
+     * Gets the length in bytes of the UTF-8 sequence that starts at
+     * {@code position}, as announced by its start byte.
+     * <p>
+     * Note: This method does not explicitly check for valid UTF-8, and may
+     * return incorrect results or throw an exception for invalid UTF-8.
+     */
+    public static int lengthOfCodePoint(Slice utf8, int position)
+    {
+        byte startByte = utf8.getByte(position);
+        return lengthOfCodePointFromStartByte(startByte);
+    }
+
+    /**
+     * Gets the UTF-8 sequence length of the code point at {@code position}.
+     * <p>
+     * Truncated UTF-8 sequences, 5 and 6 byte sequences, and invalid code points
+     * are handled by this method without throwing an exception.
+     *
+     * @return the number of bytes, starting at {@code position}, made up of the
+     * start byte and the continuation bytes that follow it, capped at the length
+     * announced by the start byte; {@code 1} for an illegal start byte
+     */
+    public static int lengthOfCodePointSafe(Slice utf8, int position)
+    {
+        int announcedLength = lengthOfCodePointFromStartByteSafe(utf8.getByte(position));
+        if (announcedLength < 0) {
+            return -announcedLength;
+        }
+
+        // extend the sequence one byte at a time while it is still shorter than
+        // announced, stays inside the slice, and the next byte is a continuation byte
+        int sliceLength = utf8.length();
+        int actualLength = 1;
+        while (actualLength < announcedLength
+                && position + actualLength < sliceLength
+                && isContinuationByte(utf8.getByteUnchecked(position + actualLength))) {
+            actualLength++;
+        }
+        return actualLength;
+    }
+
+    /**
+     * Gets the number of bytes needed to encode the code point in UTF-8.
+     *
+     * @throws InvalidCodePointException if code point is not within a valid range
+     */
+    public static int lengthOfCodePoint(int codePoint)
+    {
+        // Per RFC3629, UTF-8 is limited to 4 bytes, so anything that would need more is illegal
+        if (codePoint < 0 || codePoint >= 0x11_0000) {
+            throw new InvalidCodePointException(codePoint);
+        }
+
+        if (codePoint >= 0x1_0000) {
+            return 4;
+        }
+        if (codePoint >= 0x800) {
+            return 3;
+        }
+        if (codePoint >= 0x80) {
+            return 2;
+        }
+        // normal ASCII
+        // 0xxx_xxxx
+        return 1;
+    }
+
+    /**
+     * Gets the UTF-8 sequence length using the sequence start byte.
+     * <p>
+     * Note: This method does not explicitly check for valid UTF-8, and may
+     * return incorrect results or throw an exception for invalid UTF-8.
+     *
+     * @throws InvalidUtf8Exception if the byte is a continuation byte
+     * ({@code 10xx_xxxx}) or announces a sequence longer than 4 bytes
+     */
+    public static int lengthOfCodePointFromStartByte(byte startByte)
+    {
+        int unsigned = startByte & 0xFF;
+        if (unsigned < 0x80) {
+            // normal ASCII
+            // 0xxx_xxxx
+            return 1;
+        }
+
+        // 10xx_xxxx is a continuation byte, and per RFC3629, UTF-8 is limited
+        // to 4 bytes, so start bytes of longer sequences are illegal as well
+        boolean continuationByte = unsigned < 0xc0;
+        boolean tooLong = unsigned >= 0xf8;
+        if (continuationByte || tooLong) {
+            throw new InvalidUtf8Exception("Illegal start 0x" + toHexString(unsigned).toUpperCase() + " of code point");
+        }
+
+        if (unsigned < 0xe0) {
+            // 110x_xxxx 10xx_xxxx
+            return 2;
+        }
+        if (unsigned < 0xf0) {
+            // 1110_xxxx 10xx_xxxx 10xx_xxxx
+            return 3;
+        }
+        // 1111_0xxx 10xx_xxxx 10xx_xxxx 10xx_xxxx
+        return 4;
+    }
+
+    /**
+     * Gets the UTF-8 encoded code point at the {@code position}.
+     * <p>
+     * Note: This method does not explicitly check for valid UTF-8, and may
+     * return incorrect results or throw an exception for invalid UTF-8.
+     *
+     * @throws InvalidUtf8Exception if the byte at {@code position} is not a legal
+     * start byte, or the sequence runs past the end of the slice
+     */
+    public static int getCodePointAt(Slice utf8, int position)
+    {
+        int leadByte = utf8.getByte(position) & 0xFF;
+        if (leadByte < 0x80) {
+            // normal ASCII
+            // 0xxx_xxxx
+            return leadByte;
+        }
+        // 10xx_xxxx is a continuation byte, and per RFC3629, UTF-8 is limited
+        // to 4 bytes, so start bytes of longer sequences are illegal as well
+        if (leadByte < 0xc0 || leadByte >= 0xf8) {
+            throw new InvalidUtf8Exception("Illegal start 0x" + toHexString(leadByte).toUpperCase() + " of code point");
+        }
+
+        int sliceLength = utf8.length();
+        if (leadByte < 0xe0) {
+            // 110x_xxxx 10xx_xxxx
+            if (position + 1 >= sliceLength) {
+                throw new InvalidUtf8Exception("UTF-8 sequence truncated");
+            }
+            int bits = leadByte & 0b0001_1111;
+            bits = (bits << 6) | (utf8.getByte(position + 1) & 0b0011_1111);
+            return bits;
+        }
+        if (leadByte < 0xf0) {
+            // 1110_xxxx 10xx_xxxx 10xx_xxxx
+            if (position + 2 >= sliceLength) {
+                throw new InvalidUtf8Exception("UTF-8 sequence truncated");
+            }
+            int bits = leadByte & 0b0000_1111;
+            bits = (bits << 6) | (utf8.getByteUnchecked(position + 1) & 0b0011_1111);
+            bits = (bits << 6) | (utf8.getByteUnchecked(position + 2) & 0b0011_1111);
+            return bits;
+        }
+
+        // 1111_0xxx 10xx_xxxx 10xx_xxxx 10xx_xxxx
+        if (position + 3 >= sliceLength) {
+            throw new InvalidUtf8Exception("UTF-8 sequence truncated");
+        }
+        int bits = leadByte & 0b0000_0111;
+        bits = (bits << 6) | (utf8.getByteUnchecked(position + 1) & 0b0011_1111);
+        bits = (bits << 6) | (utf8.getByteUnchecked(position + 2) & 0b0011_1111);
+        bits = (bits << 6) | (utf8.getByteUnchecked(position + 3) & 0b0011_1111);
+        return bits;
+    }
+
+    /**
+     * Gets the UTF-8 encoded code point that ends just before the {@code position}.
+     * <p>
+     * Note: This method does not explicitly check for valid UTF-8, and may
+     * return incorrect results or throw an exception for invalid UTF-8.
+     *
+     * @throws InvalidUtf8Exception if the four bytes before {@code position}
+     * are all continuation bytes
+     */
+    public static int getCodePointBefore(Slice utf8, int position)
+    {
+        byte previousByte = utf8.getByte(position - 1);
+        if (!isContinuationByte(previousByte)) {
+            return previousByte & 0xFF;
+        }
+
+        // walk back over the continuation bytes to the start byte of the sequence
+        for (int distance = 2; distance <= 4; distance++) {
+            int start = position - distance;
+            if (!isContinuationByte(utf8.getByte(start))) {
+                return getCodePointAt(utf8, start);
+            }
+        }
+
+        // Per RFC3629, UTF-8 is limited to 4 bytes, so more bytes are illegal
+        throw new InvalidUtf8Exception("UTF-8 is not well formed");
+    }
+
+    /**
+     * Tests whether the byte has the form {@code 10xx_xxxx}, i.e. is a UTF-8
+     * continuation byte.
+     */
+    private static boolean isContinuationByte(byte b)
+    {
+        int topTwoBits = b & 0xC0;
+        return topTwoBits == 0x80;
+    }
+
+    /**
+     * Converts the code point to UTF-8, returning a newly allocated slice that
+     * holds exactly the encoded bytes.
+     *
+     * @throws InvalidCodePointException if code point is not within a valid range
+     */
+    public static Slice codePointToUtf8(int codePoint)
+    {
+        int encodedLength = lengthOfCodePoint(codePoint);
+        Slice encoded = Slices.allocate(encodedLength);
+        setCodePointAt(codePoint, encoded, 0);
+        return encoded;
+    }
+
+    /**
+     * Writes the UTF-8 sequence for the code point into {@code utf8} starting at
+     * the {@code position}.
+     *
+     * @return the number of bytes written (1 to 4)
+     * @throws InvalidCodePointException if code point is negative, a surrogate,
+     * or above U+10FFFF
+     */
+    public static int setCodePointAt(int codePoint, Slice utf8, int position)
+    {
+        // Per RFC3629, UTF-8 is limited to 4 bytes, so larger values are illegal
+        if (codePoint < 0 || codePoint >= 0x11_0000) {
+            throw new InvalidCodePointException(codePoint);
+        }
+
+        if (codePoint < 0x80) {
+            // normal ASCII
+            // 0xxx_xxxx
+            utf8.setByte(position, codePoint);
+            return 1;
+        }
+
+        // every multi byte form ends with the low 6 bits in a 10xx_xxxx byte
+        int lastByte = 0x80 | (codePoint & 0x3F);
+        if (codePoint < 0x800) {
+            // 110x_xxxx 10xx_xxxx
+            utf8.setByte(position, 0xC0 | (codePoint >>> 6));
+            utf8.setByte(position + 1, lastByte);
+            return 2;
+        }
+
+        if (MIN_SURROGATE <= codePoint && codePoint <= MAX_SURROGATE) {
+            throw new InvalidCodePointException(codePoint);
+        }
+
+        int secondToLastByte = 0x80 | ((codePoint >>> 6) & 0x3F);
+        if (codePoint < 0x1_0000) {
+            // 1110_xxxx 10xx_xxxx 10xx_xxxx
+            utf8.setByte(position, 0xE0 | ((codePoint >>> 12) & 0x0F));
+            utf8.setByte(position + 1, secondToLastByte);
+            utf8.setByte(position + 2, lastByte);
+            return 3;
+        }
+
+        // 1111_0xxx 10xx_xxxx 10xx_xxxx 10xx_xxxx
+        utf8.setByte(position, 0xF0 | ((codePoint >>> 18) & 0x07));
+        utf8.setByte(position + 1, 0x80 | ((codePoint >>> 12) & 0x3F));
+        utf8.setByte(position + 2, secondToLastByte);
+        utf8.setByte(position + 3, lastByte);
+        return 4;
+    }
+
+    /**
+     * Returns 1 if the byte matches {@code 0b10xx_xxxx} (a UTF-8 continuation
+     * byte) and 0 otherwise.
+     */
+    private static int countContinuationBytes(byte i8)
+    {
+        int unsigned = i8 & 0xff;
+        // top bit set and second bit clear
+        return (unsigned & 0b1100_0000) == 0b1000_0000 ? 1 : 0;
+    }
+
+    /**
+     * Counts the bytes of {@code i32} that match {@code 0b10xx_xxxx}.
+     */
+    private static int countContinuationBytes(int i32)
+    {
+        // move the bits selected by TOP_MASK32 down by one position, then keep
+        // only those that land on a bit that is clear in the original value
+        int shiftedTopBits = (i32 & TOP_MASK32) >>> 1;
+        int matches = shiftedTopBits & ~i32;
+        // one bit is left for every matching byte
+        return Integer.bitCount(matches);
+    }
+
+    /**
+     * Counts the bytes of {@code i64} that match {@code 0b10xx_xxxx}.
+     */
+    private static int countContinuationBytes(long i64)
+    {
+        // Count the number of bytes that match 0b10xx_xxxx as follows:
+        // 1. Mask off the 8th bit of every byte and shift it into the 7th position.
+        long shiftedTopBits = (i64 & TOP_MASK64) >>> 1;
+        // 2. Invert the bytes, which turns a 0 in the 7th bit into a one.
+        long inverted = ~i64;
+        // 3. And together the results of step 1 and 2, giving us a one in the 7th
+        //    position if the byte matched.
+        long matches = shiftedTopBits & inverted;
+        // 4. Count the number of bits in the result, which is the number of bytes
+        //    that matched.
+        return Long.bitCount(matches);
+    }
+}


=====================================
src/main/java/io/airlift/slice/Slices.java
=====================================
@@ -59,7 +59,7 @@ public final class Slices
         else {
             newCapacity = existingSlice.length();
         }
-        int minNewCapacity = existingSlice.length() + minWritableBytes;
+        int minNewCapacity = minWritableBytes;
         while (newCapacity < minNewCapacity) {
             if (newCapacity < SLICE_ALLOC_THRESHOLD) {
                 newCapacity <<= 1;
@@ -69,7 +69,7 @@ public final class Slices
             }
         }
 
-        Slice newSlice = Slices.allocate(newCapacity);
+        Slice newSlice = allocate(newCapacity);
         newSlice.setBytes(0, existingSlice, 0, existingSlice.length());
         return newSlice;
     }
@@ -112,18 +112,18 @@ public final class Slices
     {
         if (buffer instanceof DirectBuffer) {
             DirectBuffer direct = (DirectBuffer) buffer;
-            return new Slice(null, direct.address() + buffer.position(), buffer.limit() - buffer.position(), direct);
+            return new Slice(null, direct.address() + buffer.position(), buffer.limit() - buffer.position(), buffer.capacity(), direct);
         }
 
         if (buffer.hasArray()) {
             int address = ARRAY_BYTE_BASE_OFFSET + buffer.arrayOffset() + buffer.position();
-            return new Slice(buffer.array(), address, buffer.limit() - buffer.position(), null);
+            return new Slice(buffer.array(), address, buffer.limit() - buffer.position(), buffer.array().length, null);
         }
 
         throw new IllegalArgumentException("cannot wrap " + buffer.getClass().getName());
     }
 
-    public static Slice wrappedBuffer(byte[] array)
+    public static Slice wrappedBuffer(byte... array)
     {
         if (array.length == 0) {
             return EMPTY_SLICE;


=====================================
src/main/java/io/airlift/slice/UnsafeSliceFactory.java
=====================================
@@ -70,7 +70,7 @@ public class UnsafeSliceFactory
         if (size == 0) {
             return Slices.EMPTY_SLICE;
         }
-        return new Slice(null, address, size, null);
+        return new Slice(null, address, size, 0, null);
     }
 
     /**
@@ -95,6 +95,6 @@ public class UnsafeSliceFactory
         if (size == 0) {
             return Slices.EMPTY_SLICE;
         }
-        return new Slice(null, address, size, reference);
+        return new Slice(null, address, size, size, reference);
     }
 }


=====================================
src/main/java/io/airlift/slice/XxHash64.java
=====================================
@@ -56,39 +56,11 @@ public class XxHash64
         checkPositionIndexes(0, offset + length, data.length());
 
         Object base = data.getBase();
-        long index = data.getAddress() + offset;
-        long end = index + length;
+        final long address = data.getAddress() + offset;
 
         long hash;
-
         if (length >= 32) {
-            long v1 = seed + PRIME64_1 + PRIME64_2;
-            long v2 = seed + PRIME64_2;
-            long v3 = seed + 0;
-            long v4 = seed - PRIME64_1;
-
-            long limit = end - 32;
-            do {
-                v1 = mix(v1, unsafe.getLong(base, index));
-                index += 8;
-
-                v2 = mix(v2, unsafe.getLong(base, index));
-                index += 8;
-
-                v3 = mix(v3, unsafe.getLong(base, index));
-                index += 8;
-
-                v4 = mix(v4, unsafe.getLong(base, index));
-                index += 8;
-            }
-            while (index <= limit);
-
-            hash = rotateLeft(v1, 1) + rotateLeft(v2, 7) + rotateLeft(v3, 12) + rotateLeft(v4, 18);
-
-            hash = update(hash, v1);
-            hash = update(hash, v2);
-            hash = update(hash, v3);
-            hash = update(hash, v4);
+            hash = updateBody(seed, base, address, length - 32);
         }
         else {
             hash = seed + PRIME64_5;
@@ -96,18 +68,22 @@ public class XxHash64
 
         hash += length;
 
-        while (index <= end - 8) {
-            hash = updateTail(hash, unsafe.getLong(base, index));
+        // round down to the closest 32 byte boundary (clear the low 5 bits);
+        // this is the point up to which updateBody() processed
+        int index = length & 0xFFFFFFE0;
+
+        while (index <= length - 8) {
+            hash = updateTail(hash, unsafe.getLong(base, address + index));
             index += 8;
         }
 
-        if (index <= end - 4) {
-            hash = updateTail(hash, unsafe.getInt(base, index));
+        if (index <= length - 4) {
+            hash = updateTail(hash, unsafe.getInt(base, address + index));
             index += 4;
         }
 
-        while (index < end) {
-            hash = updateTail(hash, unsafe.getByte(base, index));
+        while (index < length) {
+            hash = updateTail(hash, unsafe.getByte(base, address + index));
             index++;
         }
 
@@ -116,6 +92,32 @@ public class XxHash64
         return hash;
     }
 
+    /**
+     * Mixes the input into four accumulators, 32 bytes at a time, and merges
+     * them into a single hash value.
+     *
+     * @param seed the hash seed
+     * @param base the object passed to {@code unsafe.getLong} together with the address
+     * @param address the address of the first byte to read
+     * @param length the largest offset at which a 32 byte stripe may still start;
+     * a stripe is read at every multiple of 32 from 0 up to and including this value
+     */
+    private static long updateBody(long seed, Object base, long address, int length)
+    {
+        long lane1 = seed + PRIME64_1 + PRIME64_2;
+        long lane2 = seed + PRIME64_2;
+        long lane3 = seed + 0;
+        long lane4 = seed - PRIME64_1;
+
+        int offset = 0;
+        while (offset <= length) {
+            // each stripe feeds one 8 byte word to each lane
+            long stripe = address + offset;
+            lane1 = mix(lane1, unsafe.getLong(base, stripe));
+            lane2 = mix(lane2, unsafe.getLong(base, stripe + 8));
+            lane3 = mix(lane3, unsafe.getLong(base, stripe + 16));
+            lane4 = mix(lane4, unsafe.getLong(base, stripe + 24));
+
+            offset += 32;
+        }
+
+        long merged = rotateLeft(lane1, 1) + rotateLeft(lane2, 7) + rotateLeft(lane3, 12) + rotateLeft(lane4, 18);
+
+        merged = update(merged, lane1);
+        merged = update(merged, lane2);
+        merged = update(merged, lane3);
+        merged = update(merged, lane4);
+
+        return merged;
+    }
+
     private static long mix(long current, long value)
     {
         return rotateLeft(current + value * PRIME64_2, 31) * PRIME64_1;


=====================================
src/test/java/io/airlift/slice/SliceUtf8Benchmark.java
=====================================
@@ -0,0 +1,275 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.airlift.slice;
+
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.runner.Runner;
+import org.openjdk.jmh.runner.RunnerException;
+import org.openjdk.jmh.runner.options.Options;
+import org.openjdk.jmh.runner.options.OptionsBuilder;
+import org.openjdk.jmh.runner.options.VerboseMode;
+
+import java.nio.charset.StandardCharsets;
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.stream.IntStream;
+
+import static io.airlift.slice.SliceUtf8.countCodePoints;
+import static io.airlift.slice.SliceUtf8.leftTrim;
+import static io.airlift.slice.SliceUtf8.lengthOfCodePoint;
+import static io.airlift.slice.SliceUtf8.lengthOfCodePointFromStartByte;
+import static io.airlift.slice.SliceUtf8.offsetOfCodePoint;
+import static io.airlift.slice.SliceUtf8.reverse;
+import static io.airlift.slice.SliceUtf8.rightTrim;
+import static io.airlift.slice.SliceUtf8.substring;
+import static io.airlift.slice.SliceUtf8.toLowerCase;
+import static io.airlift.slice.SliceUtf8.toUpperCase;
+import static io.airlift.slice.SliceUtf8.trim;
+import static io.airlift.slice.Slices.utf8Slice;
+import static java.lang.Character.MAX_CODE_POINT;
+import static java.lang.Character.SURROGATE;
+import static java.lang.Character.getType;
+import static java.util.concurrent.TimeUnit.MILLISECONDS;
+import static java.util.concurrent.TimeUnit.NANOSECONDS;
+import static org.openjdk.jmh.annotations.Mode.AverageTime;
+import static org.openjdk.jmh.annotations.Scope.Thread;
+
+/**
+ * JMH benchmarks for the UTF-8 operations in {@link SliceUtf8}.
+ * <p>
+ * Input is prepared by the {@code @Setup} methods of the nested state classes;
+ * the benchmark methods invoke the operation under test and, where the result
+ * can be cross-checked cheaply, verify it.
+ */
+@SuppressWarnings("MethodMayBeStatic")
+@State(Thread)
+@OutputTimeUnit(NANOSECONDS)
+@BenchmarkMode(AverageTime)
+@Fork(1)
+@Warmup(iterations = 4, time = 500, timeUnit = MILLISECONDS)
+@Measurement(iterations = 5, time = 500, timeUnit = MILLISECONDS)
+public class SliceUtf8Benchmark
+{
+    /**
+     * Walks the slice from start byte to start byte using
+     * {@code lengthOfCodePointFromStartByte} and counts the code points seen.
+     *
+     * @return the number of code points counted
+     * @throws AssertionError if the count differs from the configured length
+     */
+    @Benchmark
+    public int benchmarkLengthOfCodePointFromStartByte(BenchmarkData data)
+    {
+        Slice slice = data.getSlice();
+        int i = 0;
+        int codePoints = 0;
+        while (i < slice.length()) {
+            i += lengthOfCodePointFromStartByte(slice.getByte(i));
+            codePoints++;
+        }
+        if (codePoints != data.getLength()) {
+            throw new AssertionError();
+        }
+        return codePoints;
+    }
+
+    /**
+     * Counts the code points of the slice with {@code countCodePoints}.
+     *
+     * @return the number of code points counted
+     * @throws AssertionError if the count differs from the configured length
+     */
+    @Benchmark
+    public int benchmarkCountCodePoints(BenchmarkData data)
+    {
+        int codePoints = countCodePoints(data.getSlice());
+        if (codePoints != data.getLength()) {
+            throw new AssertionError();
+        }
+        return codePoints;
+    }
+
+    /**
+     * Looks up the byte offset of the last code point and checks that this
+     * code point ends exactly at the end of the slice.
+     *
+     * @return the byte offset of the last code point
+     * @throws AssertionError if the last code point does not end at the slice end
+     */
+    @Benchmark
+    public int benchmarkOffsetByCodePoints(BenchmarkData data)
+    {
+        Slice slice = data.getSlice();
+        int offset = offsetOfCodePoint(slice, data.getLength() - 1);
+        if (offset + lengthOfCodePoint(slice, offset) != slice.length()) {
+            throw new AssertionError();
+        }
+        return offset;
+    }
+
+    /**
+     * Extracts {@code length / 2} code points starting at code point index
+     * {@code (length / 2) - 1}.
+     */
+    @Benchmark
+    public Slice benchmarkSubstring(BenchmarkData data)
+    {
+        Slice slice = data.getSlice();
+        int length = data.getLength();
+        return substring(slice, (length / 2) - 1, length / 2);
+    }
+
+    /** Reverses the benchmark slice. */
+    @Benchmark
+    public Slice benchmarkReverse(BenchmarkData data)
+    {
+        return reverse(data.getSlice());
+    }
+
+    /** Converts the benchmark slice to lower case. */
+    @Benchmark
+    public Slice benchmarkToLowerCase(BenchmarkData data)
+    {
+        return toLowerCase(data.getSlice());
+    }
+
+    /** Converts the benchmark slice to upper case. */
+    @Benchmark
+    public Slice benchmarkToUpperCase(BenchmarkData data)
+    {
+        return toUpperCase(data.getSlice());
+    }
+
+    /** Trims the slice whose whitespace precedes the {@code 'X'} marker. */
+    @Benchmark
+    public Slice benchmarkLeftTrim(WhitespaceData data)
+    {
+        return leftTrim(data.getLeftWhitespace());
+    }
+
+    /** Trims the slice whose whitespace follows the {@code 'X'} marker. */
+    @Benchmark
+    public Slice benchmarkRightTrim(WhitespaceData data)
+    {
+        return rightTrim(data.getRightWhitespace());
+    }
+
+    /** Trims the slice that has whitespace on both sides of the {@code 'X'} marker. */
+    @Benchmark
+    public Slice benchmarkTrim(WhitespaceData data)
+    {
+        return trim(data.getBothWhitespace());
+    }
+
+    /**
+     * Benchmark input: a slice holding {@code length} randomly chosen code
+     * points encoded as UTF-8, drawn either from ASCII only or from every
+     * non-surrogate code point, depending on the {@code ascii} parameter.
+     */
+    @State(Thread)
+    public static class BenchmarkData
+    {
+        private static final int[] ASCII_CODE_POINTS;
+        private static final int[] ALL_CODE_POINTS;
+
+        static {
+            ASCII_CODE_POINTS = IntStream.rangeClosed(0, 0x7F)
+                    .toArray();
+            ALL_CODE_POINTS = IntStream.rangeClosed(0, MAX_CODE_POINT)
+                    .filter(codePoint -> getType(codePoint) != SURROGATE)
+                    .toArray();
+        }
+
+        // number of code points in the generated slice
+        @Param({ "2", "5", "10", "100", "1000", "10000" })
+        private int length;
+
+        // true: draw from ASCII only; false: draw from all non-surrogate code points
+        @Param({ "true", "false" })
+        private boolean ascii;
+
+        private Slice slice;
+        private int[] codePoints;
+
+        /** Generates the random code points and their UTF-8 encoding. */
+        @Setup
+        public void setup()
+        {
+            int[] codePointSet = ascii ? ASCII_CODE_POINTS : ALL_CODE_POINTS;
+            ThreadLocalRandom random = ThreadLocalRandom.current();
+
+            codePoints = new int[length];
+            // a code point needs at most 4 bytes in UTF-8
+            DynamicSliceOutput sliceOutput = new DynamicSliceOutput(length * 4);
+            for (int i = 0; i < codePoints.length; i++) {
+                int codePoint = codePointSet[random.nextInt(codePointSet.length)];
+                codePoints[i] = codePoint;
+                sliceOutput.appendBytes(new String(Character.toChars(codePoint)).getBytes(StandardCharsets.UTF_8));
+            }
+            slice = sliceOutput.slice();
+        }
+
+        public Slice getSlice()
+        {
+            return slice;
+        }
+
+        public int getLength()
+        {
+            return length;
+        }
+    }
+
+    /**
+     * Benchmark input for the trim benchmarks: three copies of a slice of
+     * {@code length + 1} random whitespace code points, each with one byte
+     * overwritten by {@code 'X'} (last byte, first byte, and byte
+     * {@code length / 2} respectively).
+     */
+    @State(Thread)
+    public static class WhitespaceData
+    {
+        private static final int[] ASCII_WHITESPACE;
+        private static final int[] ALL_WHITESPACE;
+
+        static {
+            ASCII_WHITESPACE = IntStream.rangeClosed(0, 0x7F)
+                    .filter(Character::isWhitespace)
+                    .toArray();
+            ALL_WHITESPACE = IntStream.rangeClosed(0, MAX_CODE_POINT)
+                    .filter(Character::isWhitespace)
+                    .toArray();
+        }
+
+        @Param({ "2", "5", "10", "100", "1000", "10000" })
+        private int length;
+
+        @Param({ "true", "false" })
+        private boolean ascii;
+
+        private Slice leftWhitespace;
+        private Slice rightWhitespace;
+        private Slice bothWhitespace;
+
+        /** Builds the three whitespace slices from one random whitespace sequence. */
+        @Setup
+        public void setup()
+        {
+            Slice whitespace = createRandomUtf8Slice(ascii ? ASCII_WHITESPACE : ALL_WHITESPACE, length + 1);
+            // NOTE(review): setByte takes a byte index; with multi-byte whitespace the
+            // marker may land inside an encoded code point - confirm this is intended
+            leftWhitespace = Slices.copyOf(whitespace);
+            leftWhitespace.setByte(leftWhitespace.length() - 1, 'X');
+            rightWhitespace = Slices.copyOf(whitespace);
+            rightWhitespace.setByte(0, 'X');
+            bothWhitespace = Slices.copyOf(whitespace);
+            bothWhitespace.setByte(length / 2, 'X');
+        }
+
+        /**
+         * Returns the UTF-8 encoding of {@code length} code points picked at
+         * random (with repetition) from {@code codePointSet}.
+         */
+        private static Slice createRandomUtf8Slice(int[] codePointSet, int length)
+        {
+            int[] codePoints = new int[length];
+            ThreadLocalRandom random = ThreadLocalRandom.current();
+            for (int i = 0; i < codePoints.length; i++) {
+                int codePoint = codePointSet[random.nextInt(codePointSet.length)];
+                codePoints[i] = codePoint;
+            }
+            return utf8Slice(new String(codePoints, 0, codePoints.length));
+        }
+
+        public int getLength()
+        {
+            return length;
+        }
+
+        public Slice getLeftWhitespace()
+        {
+            return leftWhitespace;
+        }
+
+        public Slice getRightWhitespace()
+        {
+            return rightWhitespace;
+        }
+
+        public Slice getBothWhitespace()
+        {
+            return bothWhitespace;
+        }
+    }
+
+    /**
+     * Runs every benchmark whose name contains this class's simple name.
+     *
+     * @throws RunnerException if the JMH runner fails
+     */
+    public static void main(String[] args)
+            throws RunnerException
+    {
+        Options options = new OptionsBuilder()
+                .verbosity(VerboseMode.NORMAL)
+                .include(".*" + SliceUtf8Benchmark.class.getSimpleName() + ".*")
+                .build();
+
+        new Runner(options).run();
+    }
+}


=====================================
src/test/java/io/airlift/slice/TestSlice.java
=====================================
@@ -13,7 +13,6 @@
  */
 package io.airlift.slice;
 
-import org.testng.Assert;
 import org.testng.annotations.Test;
 
 import java.io.ByteArrayInputStream;
@@ -200,9 +199,9 @@ public class TestSlice
         String s = "apple \u2603 snowman";
         Slice slice = Slices.copiedBuffer(s, UTF_8);
 
-        assertEquals(Slices.utf8Slice(s), slice);
+        assertEquals(utf8Slice(s), slice);
         assertEquals(slice.toStringUtf8(), s);
-        assertEquals(Slices.utf8Slice(s).toStringUtf8(), s);
+        assertEquals(utf8Slice(s).toStringUtf8(), s);
     }
 
     @SuppressWarnings("CharUsedInArithmeticContext")
@@ -676,29 +675,41 @@ public class TestSlice
         assertEquals(slice.getBytes(), output.getBytes());
     }
 
+    /**
+     * Slicing must leave {@code getRetainedSize()} at the size of the original
+     * 10-byte allocation while {@code length()} reflects only the sub-range.
+     */
+    @Test
+    public void testRetainedSize()
+            throws Exception
+    {
+        int allocated = 10;
+        Slice parent = Slices.allocate(allocated);
+        assertEquals(parent.getRetainedSize(), allocated);
+        assertEquals(parent.length(), allocated);
+
+        int childLength = 1;
+        Slice child = parent.slice(0, childLength);
+        assertEquals(child.getRetainedSize(), allocated);
+        assertEquals(child.length(), childLength);
+    }
+
     @Test
     public void testCopyOf()
             throws Exception
     {
         // slightly stronger guarantees for empty slice
         assertSame(Slices.copyOf(EMPTY_SLICE), EMPTY_SLICE);
-        assertSame(Slices.copyOf(Slices.utf8Slice("hello world"), 1, 0), EMPTY_SLICE);
+        assertSame(Slices.copyOf(utf8Slice("hello world"), 1, 0), EMPTY_SLICE);
 
-        Slice slice = Slices.utf8Slice("hello world");
+        Slice slice = utf8Slice("hello world");
         assertEquals(Slices.copyOf(slice), slice);
         assertEquals(Slices.copyOf(slice, 1, 3), slice.slice(1, 3));
 
         // verify it's an actual copy
-        Slice original = Slices.utf8Slice("hello world");
+        Slice original = utf8Slice("hello world");
         Slice copy = Slices.copyOf(original);
 
         original.fill((byte) 0);
-        assertEquals(copy, Slices.utf8Slice("hello world"));
+        assertEquals(copy, utf8Slice("hello world"));
 
         // read before beginning
         try {
             Slices.copyOf(slice, -1, slice.length());
-            Assert.fail();
+            fail();
         }
         catch (IndexOutOfBoundsException ignored) {
         }
@@ -706,7 +717,7 @@ public class TestSlice
         // read after end
         try {
             Slices.copyOf(slice, slice.length() + 1, 1);
-            Assert.fail();
+            fail();
         }
         catch (IndexOutOfBoundsException ignored) {
         }
@@ -714,12 +725,64 @@ public class TestSlice
         // start before but extend past end
         try {
             Slices.copyOf(slice, 1, slice.length());
-            Assert.fail();
+            fail();
         }
         catch (IndexOutOfBoundsException ignored) {
         }
+    }
+
+    /**
+     * Cross-checks {@code indexOf} against {@code indexOfBruteForce} for
+     * patterns that are absent, at the start, at the end, in the middle and
+     * repeated, and for empty slices and out-of-range offsets.
+     */
+    @Test
+    public void testIndexOf()
+            throws Exception
+    {
+        // pattern does not occur (data longer and shorter than the pattern)
+        assertIndexOf(utf8Slice("no-match-bigger"), utf8Slice("test"));
+        assertIndexOf(utf8Slice("no"), utf8Slice("test"));
+
+        // pattern occurs once or several times at different positions
+        assertIndexOf(utf8Slice("test"), utf8Slice("test"));
+        assertIndexOf(utf8Slice("test-start"), utf8Slice("test"));
+        assertIndexOf(utf8Slice("end-test"), utf8Slice("test"));
+        assertIndexOf(utf8Slice("a-test-middle"), utf8Slice("test"));
+        assertIndexOf(utf8Slice("this-test-is-a-test"), utf8Slice("test"));
+
+        // empty pattern matches at offset 0; empty data never matches
+        assertIndexOf(utf8Slice("test"), EMPTY_SLICE, 0, 0);
+        assertIndexOf(EMPTY_SLICE, utf8Slice("test"), 0, -1);
 
+        // offsets at the end, past the end and before the start yield -1
+        assertIndexOf(utf8Slice("test"), utf8Slice("no"), 4, -1);
+        assertIndexOf(utf8Slice("test"), utf8Slice("no"), 5, -1);
+        assertIndexOf(utf8Slice("test"), utf8Slice("no"), -1, -1);
+    }
+
+    /**
+     * Asserts that both {@code indexOf} and {@code indexOfBruteForce} return
+     * {@code expected} when searching {@code data} for {@code pattern} from
+     * {@code offset}.
+     */
+    public static void assertIndexOf(Slice data, Slice pattern, int offset, int expected)
+    {
+        int optimizedResult = data.indexOf(pattern, offset);
+        assertEquals(optimizedResult, expected);
+
+        int bruteForceResult = data.indexOfBruteForce(pattern, offset);
+        assertEquals(bruteForceResult, expected);
+    }
+
+    /**
+     * Collects every match offset of {@code pattern} in {@code data}, once with
+     * {@code indexOfBruteForce} and once with {@code indexOf}, and asserts that
+     * both searches find the same offsets. Each search resumes one byte past
+     * the previous hit, so overlapping matches are included.
+     */
+    public static void assertIndexOf(Slice data, Slice pattern)
+    {
+        int index;
+
+        // reference result
+        List<Integer> bruteForce = new ArrayList<>();
+        index = 0;
+        while (index >= 0 && index < data.length()) {
+            index = data.indexOfBruteForce(pattern, index);
+            if (index >= 0) {
+                bruteForce.add(index);
+                index++;
+            }
+        }
+
+        // result of the implementation under test
+        List<Integer> indexOf = new ArrayList<>();
+        index = 0;
+        while (index >= 0 && index < data.length()) {
+            index = data.indexOf(pattern, index);
+            if (index >= 0) {
+                indexOf.add(index);
+                index++;
+            }
+        }
 
+        assertEquals(bruteForce, indexOf);
     }
 
     private static List<Long> createRandomLongs(int count)


=====================================
src/test/java/io/airlift/slice/TestSliceUtf8.java
=====================================
@@ -0,0 +1,711 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.airlift.slice;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Lists;
+import com.google.common.primitives.Ints;
+import org.testng.annotations.Test;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.stream.IntStream;
+
+import static com.google.common.primitives.Bytes.concat;
+import static io.airlift.slice.SliceUtf8.codePointToUtf8;
+import static io.airlift.slice.SliceUtf8.countCodePoints;
+import static io.airlift.slice.SliceUtf8.fixInvalidUtf8;
+import static io.airlift.slice.SliceUtf8.getCodePointAt;
+import static io.airlift.slice.SliceUtf8.getCodePointBefore;
+import static io.airlift.slice.SliceUtf8.isAscii;
+import static io.airlift.slice.SliceUtf8.leftTrim;
+import static io.airlift.slice.SliceUtf8.lengthOfCodePoint;
+import static io.airlift.slice.SliceUtf8.lengthOfCodePointFromStartByte;
+import static io.airlift.slice.SliceUtf8.lengthOfCodePointSafe;
+import static io.airlift.slice.SliceUtf8.offsetOfCodePoint;
+import static io.airlift.slice.SliceUtf8.reverse;
+import static io.airlift.slice.SliceUtf8.rightTrim;
+import static io.airlift.slice.SliceUtf8.setCodePointAt;
+import static io.airlift.slice.SliceUtf8.substring;
+import static io.airlift.slice.SliceUtf8.toLowerCase;
+import static io.airlift.slice.SliceUtf8.toUpperCase;
+import static io.airlift.slice.SliceUtf8.trim;
+import static io.airlift.slice.Slices.EMPTY_SLICE;
+import static io.airlift.slice.Slices.utf8Slice;
+import static io.airlift.slice.Slices.wrappedBuffer;
+import static java.lang.Character.MAX_CODE_POINT;
+import static java.lang.Character.MIN_SURROGATE;
+import static java.lang.Character.SURROGATE;
+import static java.lang.Character.getType;
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.testng.Assert.assertEquals;
+import static org.testng.Assert.assertFalse;
+import static org.testng.Assert.assertTrue;
+
+public class TestSliceUtf8
+{
+    // code point tables and their String forms; all are filled in by the static initializer
+    private static final int[] ASCII_CODE_POINTS;
+    private static final String STRING_ASCII_CODE_POINTS;
+    private static final int[] ALL_CODE_POINTS;
+    private static final String STRING_ALL_CODE_POINTS;
+    // copy of ALL_CODE_POINTS that the static initializer passes to Collections.shuffle
+    private static final int[] ALL_CODE_POINTS_RANDOM;
+    private static final String STRING_ALL_CODE_POINTS_RANDOM;
+
+    // UTF-8 lead and continuation bytes with every payload bit set
+    private static final byte START_1_BYTE = (byte) 0b0111_1111;
+    private static final byte CONTINUATION_BYTE = (byte) 0b1011_1111;
+    private static final byte START_2_BYTE = (byte) 0b1101_1111;
+    private static final byte START_3_BYTE = (byte) 0b1110_1111;
+    private static final byte START_4_BYTE = (byte) 0b1111_0111;
+    // lead bytes of the 5 and 6 byte forms, which the tests treat as always invalid
+    private static final byte START_5_BYTE = (byte) 0b1111_1011;
+    private static final byte START_6_BYTE = (byte) 0b1111_1101;
+    // 0xFE and 0xFF, expected by the tests to be replaced as invalid
+    private static final byte INVALID_FE_BYTE = (byte) 0b11111110;
+    private static final byte INVALID_FF_BYTE = (byte) 0b11111111;
+    private static final byte X_CHAR = (byte) 'X';
+
+    // byte sequences that are not well-formed UTF-8; built by the static initializer
+    private static final List<byte[]> INVALID_SEQUENCES;
+
+    // builds the code point tables, their String forms and the list of invalid sequences
+    static {
+        ASCII_CODE_POINTS = IntStream.rangeClosed(0, 0x7F)
+                .toArray();
+        STRING_ASCII_CODE_POINTS = new String(ASCII_CODE_POINTS, 0, ASCII_CODE_POINTS.length);
+
+        // every code point except the surrogates
+        ALL_CODE_POINTS = IntStream.rangeClosed(0, MAX_CODE_POINT)
+                .filter(codePoint -> getType(codePoint) != SURROGATE)
+                .toArray();
+        STRING_ALL_CODE_POINTS = new String(ALL_CODE_POINTS, 0, ALL_CODE_POINTS.length);
+
+        // Arrays.asList(int[]) produces a one-element List<int[]>, so shuffling it would
+        // leave the code points in their original order; Ints.asList is a write-through
+        // view of the individual int values and therefore really shuffles the array
+        ALL_CODE_POINTS_RANDOM = Arrays.copyOf(ALL_CODE_POINTS, ALL_CODE_POINTS.length);
+        Collections.shuffle(Ints.asList(ALL_CODE_POINTS_RANDOM));
+        STRING_ALL_CODE_POINTS_RANDOM = new String(ALL_CODE_POINTS_RANDOM, 0, ALL_CODE_POINTS_RANDOM.length);
+
+        ImmutableList.Builder<byte[]> invalidSequences = ImmutableList.builder();
+        // lone continuation byte and truncated 2, 3 and 4 byte sequences
+        invalidSequences.add(new byte[] {CONTINUATION_BYTE});
+        invalidSequences.add(new byte[] {START_2_BYTE});
+        invalidSequences.add(new byte[] {START_3_BYTE});
+        invalidSequences.add(new byte[] {START_3_BYTE, CONTINUATION_BYTE});
+        invalidSequences.add(new byte[] {START_4_BYTE});
+        invalidSequences.add(new byte[] {START_4_BYTE, CONTINUATION_BYTE});
+        invalidSequences.add(new byte[] {START_4_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE});
+        // 4 byte sequence is limited to 10FFFF
+        invalidSequences.add(new byte[] {START_4_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE});
+        // 5 and 6 byte forms, truncated and complete
+        invalidSequences.add(new byte[] {START_5_BYTE});
+        invalidSequences.add(new byte[] {START_5_BYTE, CONTINUATION_BYTE});
+        invalidSequences.add(new byte[] {START_5_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE});
+        invalidSequences.add(new byte[] {START_5_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE});
+        invalidSequences.add(new byte[] {START_5_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE});
+        invalidSequences.add(new byte[] {START_6_BYTE});
+        invalidSequences.add(new byte[] {START_6_BYTE, CONTINUATION_BYTE});
+        invalidSequences.add(new byte[] {START_6_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE});
+        invalidSequences.add(new byte[] {START_6_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE});
+        invalidSequences.add(new byte[] {START_6_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE});
+        invalidSequences.add(new byte[] {START_6_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE});
+        invalidSequences.add(new byte[] {INVALID_FF_BYTE});
+
+        // min and max surrogate characters
+        invalidSequences.add(new byte[] {(byte) 0b11101101, (byte) 0xA0, (byte) 0x80});
+        invalidSequences.add(new byte[] {(byte) 0b11101101, (byte) 0xBF, (byte) 0xBF});
+        INVALID_SEQUENCES = invalidSequences.build();
+    }
+
+    // sample strings: empty, ASCII, Latin-1 letters, CJK, and a supplementary code point
+    private static final String STRING_EMPTY = "";
+    private static final String STRING_HELLO = "hello";
+    private static final String STRING_QUADRATICALLY = "Quadratically";
+    private static final String STRING_OESTERREICH = "\u00D6sterreich";
+    private static final String STRING_DULIOE_DULIOE = "Duli\u00F6 duli\u00F6";
+    private static final String STRING_FAITH_HOPE_LOVE = "\u4FE1\u5FF5,\u7231,\u5E0C\u671B";
+    private static final String STRING_NAIVE = "na\u00EFve";
+    // starts with a surrogate pair, i.e. a code point outside the BMP
+    private static final String STRING_OO = "\uD801\uDC2Dend";
+    // encoded length increases when converted to lower case, and the sequence ends with an invalid byte
+    private static final byte[] INVALID_SEQUENCE_TO_LOWER_EXPANDS = new byte[] {(byte) 0xC8, (byte) 0xBA, (byte) 0xFF};
+
+    // -127 is the byte 0x81, a continuation byte without a preceding start byte
+    private static final byte[] INVALID_UTF8_1 = new byte[] {-127};
+    private static final byte[] INVALID_UTF8_2 = new byte[] {50, -127, 52, 50};
+
+    /**
+     * Checks {@code countCodePoints} against {@code String.codePoints()} for the
+     * sample strings, then checks single bytes: every non-continuation byte
+     * counts as one code point, a lone continuation byte counts as zero.
+     */
+    @Test
+    public void testCodePointCount()
+    {
+        String[] samples = {
+                STRING_EMPTY,
+                STRING_HELLO,
+                STRING_QUADRATICALLY,
+                STRING_OESTERREICH,
+                STRING_DULIOE_DULIOE,
+                STRING_FAITH_HOPE_LOVE,
+                STRING_NAIVE,
+                STRING_OO,
+                STRING_ASCII_CODE_POINTS,
+                STRING_ALL_CODE_POINTS,
+                STRING_ALL_CODE_POINTS_RANDOM,
+        };
+        for (String sample : samples) {
+            assertCodePointCount(sample);
+        }
+
+        byte[] countedAsOne = {
+                START_1_BYTE,
+                START_2_BYTE,
+                START_3_BYTE,
+                START_4_BYTE,
+                START_5_BYTE,
+                START_6_BYTE,
+                INVALID_FE_BYTE,
+                INVALID_FF_BYTE,
+        };
+        for (byte single : countedAsOne) {
+            assertEquals(countCodePoints(wrappedBuffer(single)), 1);
+        }
+        assertEquals(countCodePoints(wrappedBuffer(CONTINUATION_BYTE)), 0);
+    }
+
+    /**
+     * Asserts that {@code countCodePoints} on the UTF-8 encoding of
+     * {@code string} agrees with {@code String.codePoints().count()}.
+     */
+    private static void assertCodePointCount(String string)
+    {
+        int actual = countCodePoints(utf8Slice(string));
+        long expected = string.codePoints().count();
+        assertEquals(actual, expected);
+    }
+
+    /**
+     * Checks {@code offsetOfCodePoint} on the empty slice and on every sample string.
+     */
+    @Test
+    public void testOffsetByCodePoints()
+    {
+        // an empty slice has no code point 0
+        assertEquals(offsetOfCodePoint(EMPTY_SLICE, 0), -1);
+
+        String[] samples = {
+                STRING_HELLO,
+                STRING_QUADRATICALLY,
+                STRING_OESTERREICH,
+                STRING_DULIOE_DULIOE,
+                STRING_FAITH_HOPE_LOVE,
+                STRING_NAIVE,
+                STRING_OO,
+                STRING_ASCII_CODE_POINTS,
+                STRING_ALL_CODE_POINTS,
+                STRING_ALL_CODE_POINTS_RANDOM,
+        };
+        for (String sample : samples) {
+            assertOffsetByCodePoints(sample);
+        }
+    }
+
+    /**
+     * Walks {@code string} code point by code point, tracking the expected
+     * UTF-8 byte offset, and checks both the absolute and the relative form of
+     * {@code offsetOfCodePoint}. Finally checks that asking for the code point
+     * one past the end yields -1.
+     */
+    private static void assertOffsetByCodePoints(String string)
+    {
+        Slice utf8 = utf8Slice(string);
+        int codePointCount = (int) string.codePoints().count();
+
+        int previousOffset = 0;
+        int charIndex = 0;
+        int position = 0;
+        while (position < codePointCount) {
+            boolean first = position == 0;
+
+            // expected offset = previous offset + encoded length of the previous code point
+            int expectedOffset = 0;
+            if (!first) {
+                expectedOffset = previousOffset + lengthOfCodePoint(string.codePointAt(charIndex));
+                charIndex = string.offsetByCodePoints(charIndex, 1);
+            }
+
+            // avoid n^2 performance for large test string
+            if (position < 10000) {
+                assertEquals(offsetOfCodePoint(utf8, position), expectedOffset);
+            }
+
+            // relative lookup: one code point forward from the previous offset
+            if (!first) {
+                assertEquals(offsetOfCodePoint(utf8, previousOffset, 1), expectedOffset);
+            }
+
+            previousOffset = expectedOffset;
+            position++;
+        }
+        assertEquals(offsetOfCodePoint(utf8Slice(string), codePointCount), -1);
+    }
+
+    /**
+     * Runs the substring checks on the sample strings and on the last 500
+     * entries of the full code point table.
+     */
+    @Test
+    public void testSubstring()
+    {
+        String[] samples = {
+                STRING_HELLO,
+                STRING_QUADRATICALLY,
+                STRING_OESTERREICH,
+                STRING_DULIOE_DULIOE,
+                STRING_FAITH_HOPE_LOVE,
+                STRING_NAIVE,
+                STRING_OO,
+                STRING_ASCII_CODE_POINTS,
+        };
+        for (String sample : samples) {
+            assertSubstring(sample);
+        }
+
+        // substring test over all code points takes too long, so only run it on the tail
+        // that has the largest code points
+        int tailLength = 500;
+        String tail = new String(ALL_CODE_POINTS, ALL_CODE_POINTS.length - tailLength, tailLength);
+        assertSubstring(tail);
+    }
+
+    /**
+     * For each start index in the first half of {@code string}, compares
+     * {@code substring} on the UTF-8 slice with the same range taken from the
+     * code point array (at most 20 code points), then checks the full-range
+     * and the empty substring.
+     */
+    private static void assertSubstring(String string)
+    {
+        Slice utf8 = utf8Slice(string);
+        int[] codePoints = string.codePoints().toArray();
+        int total = codePoints.length;
+
+        int start = 0;
+        while (start < total / 2) {
+            int count = Math.min(20, total - 2 * start - 1);
+            Slice actual = substring(utf8, start, count);
+            byte[] expectedBytes = new String(codePoints, start, count).getBytes(UTF_8);
+            assertEquals(actual, wrappedBuffer(expectedBytes));
+            start++;
+        }
+
+        assertEquals(substring(utf8, 0, total), utf8);
+        assertEquals(substring(utf8, 0, 0), EMPTY_SLICE);
+    }
+
+    /** A start index beyond the last code point must be rejected. */
+    @Test(expectedExceptions = IllegalArgumentException.class, expectedExceptionsMessageRegExp = "UTF-8 does not contain 10 code points")
+    public void testSubstringInvalidStart()
+    {
+        Slice hello = utf8Slice(STRING_HELLO);
+        substring(hello, 10, 2);
+    }
+
+    /** Requesting more code points than the slice holds must be rejected. */
+    @Test(expectedExceptions = IllegalArgumentException.class, expectedExceptionsMessageRegExp = "UTF-8 does not contain 7 code points")
+    public void testSubstringInvalidLength()
+    {
+        Slice hello = utf8Slice(STRING_HELLO);
+        substring(hello, 0, 7);
+    }
+
+    /** A substring that would end inside a truncated 3-byte sequence must be rejected. */
+    @Test(expectedExceptions = InvalidUtf8Exception.class, expectedExceptionsMessageRegExp = "UTF-8 is not well formed")
+    public void testSubstringInvalidUtf8()
+    {
+        // "foo" followed by a 3-byte start byte with only one continuation byte
+        byte[] truncated = {(byte) 'f', (byte) 'o', (byte) 'o', START_3_BYTE, CONTINUATION_BYTE};
+        substring(wrappedBuffer(truncated), 0, 4);
+    }
+
+    /**
+     * Checks {@code reverse} on the sample strings and on every entry of
+     * {@code INVALID_SEQUENCES}.
+     */
+    @Test
+    public void testReverse()
+    {
+        String[] samples = {
+                STRING_HELLO,
+                STRING_QUADRATICALLY,
+                STRING_OESTERREICH,
+                STRING_DULIOE_DULIOE,
+                STRING_FAITH_HOPE_LOVE,
+                STRING_NAIVE,
+                STRING_OO,
+                STRING_ASCII_CODE_POINTS,
+                STRING_ALL_CODE_POINTS,
+        };
+        for (String sample : samples) {
+            assertReverse(sample);
+        }
+
+        for (byte[] invalidSequence : INVALID_SEQUENCES) {
+            assertReverseWithInvalidSequence(invalidSequence);
+        }
+    }
+
+    /**
+     * Asserts that {@code reverse} on the UTF-8 slice equals the encoding of
+     * the string's code points in reverse order.
+     */
+    private static void assertReverse(String string)
+    {
+        Slice actualReverse = reverse(utf8Slice(string));
+
+        List<Integer> reversedCodePoints = Lists.reverse(Ints.asList(string.codePoints().toArray()));
+        int[] expectedCodePoints = Ints.toArray(reversedCodePoints);
+        byte[] expectedBytes = new String(expectedCodePoints, 0, expectedCodePoints.length).getBytes(UTF_8);
+
+        assertEquals(actualReverse, wrappedBuffer(expectedBytes));
+    }
+
+    /**
+     * Asserts that {@code reverse} keeps the bytes of {@code invalidSequence}
+     * in their original order while reversing the ASCII text around it: alone,
+     * after a prefix, before a suffix, and between both.
+     */
+    private static void assertReverseWithInvalidSequence(byte[] invalidSequence)
+    {
+        byte[] abc = {'a', 'b', 'c'};
+        byte[] cba = {'c', 'b', 'a'};
+        byte[] xyz = {'x', 'y', 'z'};
+        byte[] zyx = {'z', 'y', 'x'};
+
+        // invalid sequence on its own
+        assertEquals(
+                reverse(wrappedBuffer(invalidSequence)),
+                wrappedBuffer(invalidSequence));
+
+        // text before the invalid sequence
+        Slice prefixed = wrappedBuffer(concat(abc, invalidSequence));
+        assertEquals(reverse(prefixed), wrappedBuffer(concat(invalidSequence, cba)));
+
+        // text after the invalid sequence
+        Slice suffixed = wrappedBuffer(concat(invalidSequence, xyz));
+        assertEquals(reverse(suffixed), wrappedBuffer(concat(zyx, invalidSequence)));
+
+        // text on both sides
+        Slice surrounded = wrappedBuffer(concat(abc, invalidSequence, xyz));
+        assertEquals(reverse(surrounded), wrappedBuffer(concat(zyx, invalidSequence, cba)));
+    }
+
+    /**
+     * Checks {@code isAscii} on the sample strings: only the pure ASCII samples
+     * must be reported as ASCII.
+     */
+    @Test
+    public void testIsAscii()
+    {
+        String[] samples = {
+                STRING_HELLO,
+                STRING_QUADRATICALLY,
+                STRING_OESTERREICH,
+                STRING_DULIOE_DULIOE,
+                STRING_FAITH_HOPE_LOVE,
+                STRING_NAIVE,
+                STRING_OO,
+                STRING_ASCII_CODE_POINTS,
+                STRING_ALL_CODE_POINTS,
+        };
+        boolean[] expectedAscii = {
+                true,
+                true,
+                false,
+                false,
+                false,
+                false,
+                false,
+                true,
+                false,
+        };
+
+        for (int i = 0; i < samples.length; i++) {
+            boolean ascii = isAscii(utf8Slice(samples[i]));
+            if (expectedAscii[i]) {
+                assertTrue(ascii);
+            }
+            else {
+                assertFalse(ascii);
+            }
+        }
+    }
+
+    @Test
+    public void testFixInvalidUtf8()
+    {
+        assertFixInvalidUtf8(utf8Slice(STRING_OESTERREICH), utf8Slice(STRING_OESTERREICH));
+        assertFixInvalidUtf8(utf8Slice(STRING_HELLO), utf8Slice(STRING_HELLO));
+        assertFixInvalidUtf8(utf8Slice(STRING_QUADRATICALLY), utf8Slice(STRING_QUADRATICALLY));
+        assertFixInvalidUtf8(utf8Slice(STRING_OESTERREICH), utf8Slice(STRING_OESTERREICH));
+        assertFixInvalidUtf8(utf8Slice(STRING_DULIOE_DULIOE), utf8Slice(STRING_DULIOE_DULIOE));
+        assertFixInvalidUtf8(utf8Slice(STRING_FAITH_HOPE_LOVE), utf8Slice(STRING_FAITH_HOPE_LOVE));
+        assertFixInvalidUtf8(utf8Slice(STRING_NAIVE), utf8Slice(STRING_NAIVE));
+        assertFixInvalidUtf8(utf8Slice(STRING_OO), utf8Slice(STRING_OO));
+        assertFixInvalidUtf8(utf8Slice(STRING_ASCII_CODE_POINTS), utf8Slice(STRING_ASCII_CODE_POINTS));
+        assertFixInvalidUtf8(utf8Slice(STRING_ALL_CODE_POINTS), utf8Slice(STRING_ALL_CODE_POINTS));
+
+        // max valid value for 2, 3, and 4 byte sequences
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_2_BYTE, CONTINUATION_BYTE), utf8Slice("X\u07FF"));
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_3_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE), utf8Slice("X\uFFFF"));
+        // 4 byte sequence is limited to U+10FFFF by RFC 3629
+        assertFixInvalidUtf8(
+                wrappedBuffer(X_CHAR, (byte) 0xF4, (byte) 0x8F, CONTINUATION_BYTE, CONTINUATION_BYTE),
+                wrappedBuffer(X_CHAR, (byte) 0xF4, (byte) 0x8F, CONTINUATION_BYTE, CONTINUATION_BYTE));
+
+        // 4 byte sequence is limited to 10FFFF
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_4_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE), utf8Slice("X\uFFFD"));
+
+        // 5 and 6 byte sequences are always invalid
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_5_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE), utf8Slice("X\uFFFD"));
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_6_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE), utf8Slice("X\uFFFD"));
+
+        // continuation byte alone is invalid
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, CONTINUATION_BYTE), utf8Slice("X\uFFFD"));
+
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, INVALID_FE_BYTE), utf8Slice("X\uFFFD"));
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, INVALID_FF_BYTE), utf8Slice("X\uFFFD"));
+
+        // sequences with not enough continuation bytes, but enough bytes
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_2_BYTE, X_CHAR), utf8Slice("X\uFFFDX"));
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_3_BYTE, X_CHAR, X_CHAR), utf8Slice("X\uFFFDXX"));
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_3_BYTE, CONTINUATION_BYTE, X_CHAR), utf8Slice("X\uFFFDX"));
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_4_BYTE, X_CHAR, X_CHAR, X_CHAR), utf8Slice("X\uFFFDXXX"));
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_4_BYTE, CONTINUATION_BYTE, X_CHAR, X_CHAR), utf8Slice("X\uFFFDXX"));
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_4_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, X_CHAR), utf8Slice("X\uFFFDX"));
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_5_BYTE, X_CHAR, X_CHAR, X_CHAR, X_CHAR), utf8Slice("X\uFFFDXXXX"));
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_5_BYTE, CONTINUATION_BYTE, X_CHAR, X_CHAR, X_CHAR), utf8Slice("X\uFFFDXXX"));
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_5_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, X_CHAR, X_CHAR), utf8Slice("X\uFFFDXX"));
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_5_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, X_CHAR), utf8Slice("X\uFFFDX"));
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_6_BYTE, X_CHAR, X_CHAR, X_CHAR, X_CHAR, X_CHAR), utf8Slice("X\uFFFDXXXXX"));
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_6_BYTE, CONTINUATION_BYTE, X_CHAR, X_CHAR, X_CHAR, X_CHAR), utf8Slice("X\uFFFDXXXX"));
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_6_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, X_CHAR, X_CHAR, X_CHAR), utf8Slice("X\uFFFDXXX"));
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_6_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, X_CHAR, X_CHAR), utf8Slice("X\uFFFDXX"));
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_6_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, X_CHAR), utf8Slice("X\uFFFDX"));
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, INVALID_FE_BYTE), utf8Slice("X\uFFFD"));
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, INVALID_FF_BYTE), utf8Slice("X\uFFFD"));
+
+        // truncated sequences
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, CONTINUATION_BYTE), utf8Slice("X\uFFFD"));
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_2_BYTE), utf8Slice("X\uFFFD"));
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_3_BYTE), utf8Slice("X\uFFFD"));
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_3_BYTE, CONTINUATION_BYTE), utf8Slice("X\uFFFD"));
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_4_BYTE), utf8Slice("X\uFFFD"));
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_4_BYTE, CONTINUATION_BYTE), utf8Slice("X\uFFFD"));
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_4_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE), utf8Slice("X\uFFFD"));
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_5_BYTE), utf8Slice("X\uFFFD"));
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_5_BYTE, CONTINUATION_BYTE), utf8Slice("X\uFFFD"));
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_5_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE), utf8Slice("X\uFFFD"));
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_5_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE), utf8Slice("X\uFFFD"));
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_6_BYTE), utf8Slice("X\uFFFD"));
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_6_BYTE, CONTINUATION_BYTE), utf8Slice("X\uFFFD"));
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_6_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE), utf8Slice("X\uFFFD"));
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_6_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE), utf8Slice("X\uFFFD"));
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_6_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE), utf8Slice("X\uFFFD"));
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, START_6_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE), utf8Slice("X\uFFFD"));
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, INVALID_FE_BYTE), utf8Slice("X\uFFFD"));
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, INVALID_FF_BYTE), utf8Slice("X\uFFFD"));
+        // min and max surrogate characters
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, (byte) 0b11101101, (byte) 0xA0, (byte) 0x80), utf8Slice("X\uFFFD"));
+        assertFixInvalidUtf8(wrappedBuffer(X_CHAR, (byte) 0b11101101, (byte) 0xBF, (byte) 0xBF), utf8Slice("X\uFFFD"));
+    }
+
+    /**
+     * Asserts that passing {@code testSlice} through {@code fixInvalidUtf8} yields {@code expectedSlice}.
+     */
+    private static void assertFixInvalidUtf8(Slice testSlice, Slice expectedSlice)
+    {
+        Slice fixed = fixInvalidUtf8(testSlice);
+        assertEquals(fixed, expectedSlice);
+    }
+
+    @Test
+    public void testCaseChange()
+    {
+        assertCaseChange(STRING_ALL_CODE_POINTS);
+        assertCaseChange(STRING_FAITH_HOPE_LOVE);
+        assertCaseChange(STRING_HELLO);
+        assertCaseChange(STRING_QUADRATICALLY);
+        assertCaseChange(STRING_OESTERREICH);
+        assertCaseChange(STRING_DULIOE_DULIOE);
+        assertCaseChange(STRING_FAITH_HOPE_LOVE);
+        assertCaseChange(STRING_NAIVE);
+        assertCaseChange(STRING_OO);
+        assertCaseChange(STRING_ASCII_CODE_POINTS);
+        assertCaseChange(STRING_ALL_CODE_POINTS);
+        assertCaseChange(STRING_ALL_CODE_POINTS_RANDOM);
+
+        toLowerCase(Slices.wrappedBuffer(INVALID_SEQUENCE_TO_LOWER_EXPANDS));
+
+        INVALID_SEQUENCES.forEach(TestSliceUtf8::assertCaseChangeWithInvalidSequence);
+    }
+
+    /**
+     * Asserts that {@code toLowerCase} and {@code toUpperCase} leave {@code invalidSequence} byte-for-byte
+     * unchanged, both on its own and with ASCII text placed before it, after it, or on both sides
+     * (the ASCII text itself must still change case).
+     */
+    private static void assertCaseChangeWithInvalidSequence(byte[] invalidSequence)
+    {
+        byte[] upperFoo = {'F', 'O', 'O'};
+        byte[] lowerFoo = {'f', 'o', 'o'};
+        byte[] upperBar = {'B', 'A', 'R'};
+        byte[] lowerBar = {'b', 'a', 'r'};
+
+        // the invalid sequence on its own
+        assertEquals(toLowerCase(wrappedBuffer(invalidSequence)), wrappedBuffer(invalidSequence));
+        assertEquals(toUpperCase(wrappedBuffer(invalidSequence)), wrappedBuffer(invalidSequence));
+
+        // ASCII text in front of the invalid sequence
+        assertEquals(toLowerCase(wrappedBuffer(concat(upperFoo, invalidSequence))), wrappedBuffer(concat(lowerFoo, invalidSequence)));
+        assertEquals(toUpperCase(wrappedBuffer(concat(lowerFoo, invalidSequence))), wrappedBuffer(concat(upperFoo, invalidSequence)));
+
+        // ASCII text behind the invalid sequence
+        assertEquals(toLowerCase(wrappedBuffer(concat(invalidSequence, upperFoo))), wrappedBuffer(concat(invalidSequence, lowerFoo)));
+        assertEquals(toUpperCase(wrappedBuffer(concat(invalidSequence, lowerFoo))), wrappedBuffer(concat(invalidSequence, upperFoo)));
+
+        // ASCII text on both sides of the invalid sequence
+        assertEquals(
+                toLowerCase(wrappedBuffer(concat(upperFoo, invalidSequence, upperBar))),
+                wrappedBuffer(concat(lowerFoo, invalidSequence, lowerBar)));
+        assertEquals(
+                toUpperCase(wrappedBuffer(concat(lowerFoo, invalidSequence, lowerBar))),
+                wrappedBuffer(concat(upperFoo, invalidSequence, upperBar)));
+    }
+
+    /**
+     * Asserts that {@code toLowerCase} and {@code toUpperCase} on the UTF-8 encoding of {@code string}
+     * match the per-code-point mapping computed by {@code lowerByCodePoint} / {@code upperByCodePoint},
+     * and that the same holds when each result is converted to the opposite case.
+     */
+    private static void assertCaseChange(String string)
+    {
+        // lower case
+        String expectedLower = lowerByCodePoint(string);
+        Slice actualLower = toLowerCase(utf8Slice(string));
+        Slice expectedLowerSlice = wrappedBuffer(expectedLower.getBytes(UTF_8));
+        assertEquals(actualLower, expectedLowerSlice);
+
+        // upper case
+        String expectedUpper = upperByCodePoint(string);
+        Slice actualUpper = toUpperCase(utf8Slice(string));
+        Slice expectedUpperSlice = wrappedBuffer(expectedUpper.getBytes(UTF_8));
+        assertEquals(actualUpper, expectedUpperSlice);
+
+        // Convert each result to the opposite case. Not every code point round-trips,
+        // so the expected value is recomputed rather than compared with the input.
+        Slice loweredUpper = toLowerCase(actualUpper);
+        assertEquals(loweredUpper, wrappedBuffer(lowerByCodePoint(expectedUpper).getBytes(UTF_8)));
+        Slice upperedLower = toUpperCase(actualLower);
+        assertEquals(upperedLower, wrappedBuffer(upperByCodePoint(expectedLower).getBytes(UTF_8)));
+    }
+
+    /**
+     * Lower-cases {@code string} one code point at a time using {@link Character#toLowerCase(int)}.
+     *
+     * @param string the text to convert
+     * @return a new string built from the individually lower-cased code points
+     */
+    private static String lowerByCodePoint(String string)
+    {
+        int[] lowerCodePoints = string.codePoints().map(Character::toLowerCase).toArray();
+        return new String(lowerCodePoints, 0, lowerCodePoints.length);
+    }
+
+    /**
+     * Upper-cases {@code string} one code point at a time using {@link Character#toUpperCase(int)}.
+     *
+     * @param string the text to convert
+     * @return a new string built from the individually upper-cased code points
+     */
+    private static String upperByCodePoint(String string)
+    {
+        int[] mapped = string.codePoints()
+                .map(Character::toUpperCase)
+                .toArray();
+        int count = mapped.length;
+        return new String(mapped, 0, count);
+    }
+
+    @Test
+    public void testLeftTrim()
+    {
+        assertLeftTrim("");
+        assertLeftTrim("hello");
+        assertLeftTrim("hello world");
+        assertLeftTrim("hello world  ");
+
+        INVALID_SEQUENCES.forEach(TestSliceUtf8::assertLeftTrim);
+    }
+
+    /**
+     * Encodes {@code string} as UTF-8 and delegates to the {@code byte[]} overload.
+     */
+    private static void assertLeftTrim(String string)
+    {
+        byte[] utf8 = string.getBytes(UTF_8);
+        assertLeftTrim(utf8);
+    }
+
+    /**
+     * Asserts that {@code leftTrim} returns {@code sequence} unchanged, and that it removes whitespace
+     * placed in front of {@code sequence}, for every entry of {@code ALL_CODE_POINTS} that
+     * {@link Character#isWhitespace(int)} classifies as whitespace.
+     */
+    private static void assertLeftTrim(byte[] sequence)
+    {
+        Slice expected = wrappedBuffer(sequence);
+        byte[] asciiWhitespace = {'\r', '\n', '\t', ' '};
+
+        // nothing to strip
+        assertEquals(leftTrim(wrappedBuffer(sequence)), expected);
+
+        for (int codePoint : ALL_CODE_POINTS) {
+            if (!Character.isWhitespace(codePoint)) {
+                continue;
+            }
+            byte[] whitespace = new String(Character.toChars(codePoint)).getBytes(UTF_8);
+
+            // a single leading whitespace code point
+            assertEquals(leftTrim(wrappedBuffer(concat(whitespace, sequence))), expected);
+            // a leading run that mixes this code point with ASCII whitespace
+            assertEquals(leftTrim(wrappedBuffer(concat(whitespace, asciiWhitespace, whitespace, sequence))), expected);
+        }
+    }
+
+    @Test
+    public void testRightTrim()
+    {
+        assertRightTrim("");
+        assertRightTrim("hello");
+        assertRightTrim("hello world");
+        assertRightTrim("  hello world");
+
+        INVALID_SEQUENCES.forEach(TestSliceUtf8::assertRightTrim);
+    }
+
+    /**
+     * Encodes {@code string} as UTF-8 and delegates to the {@code byte[]} overload.
+     */
+    private static void assertRightTrim(String string)
+    {
+        byte[] utf8 = string.getBytes(UTF_8);
+        assertRightTrim(utf8);
+    }
+
+    /**
+     * Asserts that {@code rightTrim} returns {@code sequence} unchanged, and that it removes whitespace
+     * appended after {@code sequence}, for every entry of {@code ALL_CODE_POINTS} that
+     * {@link Character#isWhitespace(int)} classifies as whitespace.
+     */
+    private static void assertRightTrim(byte[] sequence)
+    {
+        Slice expected = wrappedBuffer(sequence);
+        byte[] asciiWhitespace = {'\r', '\n', '\t', ' '};
+
+        // nothing to strip
+        assertEquals(rightTrim(wrappedBuffer(sequence)), expected);
+
+        for (int codePoint : ALL_CODE_POINTS) {
+            if (!Character.isWhitespace(codePoint)) {
+                continue;
+            }
+            byte[] whitespace = new String(Character.toChars(codePoint)).getBytes(UTF_8);
+
+            // a single trailing whitespace code point
+            assertEquals(rightTrim(wrappedBuffer(concat(sequence, whitespace))), expected);
+            // a trailing run that mixes this code point with ASCII whitespace
+            assertEquals(rightTrim(wrappedBuffer(concat(sequence, whitespace, asciiWhitespace, whitespace))), expected);
+        }
+    }
+
+    @Test
+    public void testTrim()
+    {
+        assertTrim("");
+        assertTrim("hello");
+        assertTrim("hello world");
+
+        INVALID_SEQUENCES.forEach(TestSliceUtf8::assertTrim);
+    }
+
+    /**
+     * Encodes {@code string} as UTF-8 and delegates to the {@code byte[]} overload.
+     */
+    private static void assertTrim(String string)
+    {
+        byte[] utf8 = string.getBytes(UTF_8);
+        assertTrim(utf8);
+    }
+
+    /**
+     * Asserts that {@code trim} returns {@code sequence} unchanged, and that it removes whitespace
+     * placed on both sides of {@code sequence}, for every entry of {@code ALL_CODE_POINTS} that
+     * {@link Character#isWhitespace(int)} classifies as whitespace.
+     */
+    private static void assertTrim(byte[] sequence)
+    {
+        Slice expected = wrappedBuffer(sequence);
+        byte[] asciiWhitespace = {'\r', '\n', '\t', ' '};
+
+        // nothing to strip
+        assertEquals(trim(wrappedBuffer(sequence)), expected);
+
+        for (int codePoint : ALL_CODE_POINTS) {
+            if (!Character.isWhitespace(codePoint)) {
+                continue;
+            }
+            byte[] whitespace = new String(Character.toChars(codePoint)).getBytes(UTF_8);
+
+            // a single whitespace code point on each side
+            assertEquals(trim(wrappedBuffer(concat(whitespace, sequence, whitespace))), expected);
+            // runs on each side that mix this code point with ASCII whitespace
+            byte[] padded = concat(whitespace, asciiWhitespace, whitespace, sequence, whitespace, asciiWhitespace, whitespace);
+            assertEquals(trim(wrappedBuffer(padded)), expected);
+        }
+    }
+
+    /**
+     * Tests invalid UTF-8 encodings. There is no "correct" answer for these inputs; the test only
+     * pins the values that {@code countCodePoints} and {@code offsetOfCodePoint} return for them.
+     */
+    @Test
+    public void testInvalidUtf8()
+    {
+        Slice first = wrappedBuffer(INVALID_UTF8_1);
+        Slice second = wrappedBuffer(INVALID_UTF8_2);
+
+        // number of code points reported for each input
+        assertEquals(countCodePoints(first), 0);
+        assertEquals(countCodePoints(second), 3);
+
+        // first input: index 0 maps to offset 0, index 1 is reported as -1
+        assertEquals(offsetOfCodePoint(first, 0), 0);
+        assertEquals(offsetOfCodePoint(first, 1), -1);
+
+        // second input: three code point offsets, then -1
+        assertEquals(offsetOfCodePoint(second, 0), 0);
+        assertEquals(offsetOfCodePoint(second, 1), 2);
+        assertEquals(offsetOfCodePoint(second, 2), 3);
+        assertEquals(offsetOfCodePoint(second, 3), -1);
+    }
+
+    @Test
+    public void testLengthOfCodePoint()
+    {
+        assertEquals(lengthOfCodePointFromStartByte(START_1_BYTE), 1);
+        assertEquals(lengthOfCodePointFromStartByte(START_2_BYTE), 2);
+        assertEquals(lengthOfCodePointFromStartByte(START_3_BYTE), 3);
+        assertEquals(lengthOfCodePointFromStartByte(START_4_BYTE), 4);
+
+        for (int codePoint : ALL_CODE_POINTS) {
+            String string = new String(new int[] {codePoint}, 0, 1);
+            assertEquals(string.codePoints().count(), 1);
+
+            Slice utf8 = wrappedBuffer(string.getBytes(UTF_8));
+            assertEquals(lengthOfCodePoint(codePoint), utf8.length());
+            assertEquals(lengthOfCodePoint(utf8, 0), utf8.length());
+            assertEquals(lengthOfCodePointSafe(utf8, 0), utf8.length());
+            assertEquals(lengthOfCodePointFromStartByte(utf8.getByte(0)), utf8.length());
+
+            assertEquals(getCodePointAt(utf8, 0), codePoint);
+            assertEquals(getCodePointBefore(utf8, utf8.length()), codePoint);
+
+            assertEquals(codePointToUtf8(codePoint), utf8);
+        }
+
+        for (byte[] sequence : INVALID_SEQUENCES) {
+            assertEquals(lengthOfCodePointSafe(wrappedBuffer(sequence), 0), sequence.length);
+            assertEquals(lengthOfCodePointSafe(wrappedBuffer(concat(new byte[] {'x'}, sequence)), 1), sequence.length);
+            assertEquals(lengthOfCodePointSafe(wrappedBuffer(concat(sequence, new byte[] {'x'})), 0), sequence.length);
+        }
+    }
+
+    @Test(expectedExceptions = InvalidCodePointException.class, expectedExceptionsMessageRegExp = "Invalid code point 0xFFFFFFFF")
+    public void testLengthOfNegativeCodePoint()
+    {
+        lengthOfCodePoint(-1);
+    }
+
+    @Test(expectedExceptions = InvalidCodePointException.class, expectedExceptionsMessageRegExp = "Invalid code point 0x110000")
+    public void testLengthOfOutOfRangeCodePoint()
+    {
+        lengthOfCodePoint(MAX_CODE_POINT + 1);
+    }
+
+    @Test(expectedExceptions = InvalidUtf8Exception.class, expectedExceptionsMessageRegExp = "Illegal start 0xBF of code point")
+    public void testLengthOfCodePointContinuationByte()
+    {
+        lengthOfCodePointFromStartByte(CONTINUATION_BYTE);
+    }
+
+    @Test(expectedExceptions = InvalidUtf8Exception.class, expectedExceptionsMessageRegExp = "Illegal start 0xFB of code point")
+    public void testLengthOfCodePoint5ByteSequence()
+    {
+        lengthOfCodePointFromStartByte(START_5_BYTE);
+    }
+
+    @Test(expectedExceptions = InvalidUtf8Exception.class, expectedExceptionsMessageRegExp = "Illegal start 0xFD of code point")
+    public void testLengthOfCodePoint6ByteByte()
+    {
+        lengthOfCodePointFromStartByte(START_6_BYTE);
+    }
+
+    @Test(expectedExceptions = InvalidUtf8Exception.class, expectedExceptionsMessageRegExp = "Illegal start 0xFE of code point")
+    public void testLengthOfCodePointFEByte()
+    {
+        lengthOfCodePointFromStartByte(INVALID_FE_BYTE);
+    }
+
+    @Test(expectedExceptions = InvalidUtf8Exception.class, expectedExceptionsMessageRegExp = "Illegal start 0xFF of code point")
+    public void testLengthOfCodePointFFByte()
+    {
+        lengthOfCodePointFromStartByte(INVALID_FF_BYTE);
+    }
+
+    /**
+     * Reading at a 2-byte start byte that is the last byte of the slice must report truncation.
+     */
+    @Test(expectedExceptions = InvalidUtf8Exception.class, expectedExceptionsMessageRegExp = "UTF-8 sequence truncated")
+    public void testCodePointAtTruncated2()
+    {
+        Slice truncated = wrappedBuffer((byte) 'x', START_2_BYTE);
+        getCodePointAt(truncated, 1);
+    }
+
+    /**
+     * Reading a 3-byte sequence of which only two bytes are present must report truncation.
+     */
+    @Test(expectedExceptions = InvalidUtf8Exception.class, expectedExceptionsMessageRegExp = "UTF-8 sequence truncated")
+    public void testCodePointAtTruncated3()
+    {
+        Slice truncated = wrappedBuffer((byte) 'x', START_3_BYTE, CONTINUATION_BYTE);
+        getCodePointAt(truncated, 1);
+    }
+
+    /**
+     * Reading a 4-byte sequence of which only three bytes are present must report truncation.
+     */
+    @Test(expectedExceptions = InvalidUtf8Exception.class, expectedExceptionsMessageRegExp = "UTF-8 sequence truncated")
+    public void testCodePointAtTruncated4()
+    {
+        Slice truncated = wrappedBuffer((byte) 'x', START_4_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE);
+        getCodePointAt(truncated, 1);
+    }
+
+    /**
+     * Reading forward at the start byte of a complete 5-byte sequence must fail on the start byte.
+     */
+    @Test(expectedExceptions = InvalidUtf8Exception.class, expectedExceptionsMessageRegExp = "Illegal start 0xFB of code point")
+    public void testCodePointAt5ByteSequence()
+    {
+        Slice fiveByteSequence = wrappedBuffer((byte) 'x', START_5_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE);
+        getCodePointAt(fiveByteSequence, 1);
+    }
+
+    /**
+     * Reading backward from the end of a slice that ends in a 5-byte sequence must fail.
+     */
+    @Test(expectedExceptions = InvalidUtf8Exception.class, expectedExceptionsMessageRegExp = "UTF-8 is not well formed")
+    public void testCodePointBefore5ByteSequence()
+    {
+        Slice fiveByteSequence = wrappedBuffer((byte) 'x', START_5_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE);
+        // position 6 is one past the last of the six bytes
+        getCodePointBefore(fiveByteSequence, 6);
+    }
+
+    @Test(expectedExceptions = InvalidCodePointException.class, expectedExceptionsMessageRegExp = "Invalid code point 0xFFFFFFFF")
+    public void testSetNegativeCodePoint()
+    {
+        setCodePointAt(-1, Slices.allocate(8), 0);
+    }
+    @Test(expectedExceptions = InvalidCodePointException.class, expectedExceptionsMessageRegExp = "Invalid code point 0xD800")
+    public void testSetSurrogateCodePoint()
+    {
+        setCodePointAt(MIN_SURROGATE, Slices.allocate(8), 0);
+    }
+
+    @Test(expectedExceptions = InvalidCodePointException.class, expectedExceptionsMessageRegExp = "Invalid code point 0x110000")
+    public void testSetOutOfRangeCodePoint()
+    {
+        setCodePointAt(MAX_CODE_POINT + 1, Slices.allocate(8), 0);
+    }
+
+    @Test(expectedExceptions = InvalidCodePointException.class, expectedExceptionsMessageRegExp = "Invalid code point 0xFFFFFFBF")
+    public void testSetCodePointContinuationByte()
+    {
+        setCodePointAt(CONTINUATION_BYTE, Slices.allocate(8), 0);
+    }
+
+}



View it on GitLab: https://salsa.debian.org/java-team/airlift-slice/-/commit/0b1214952e2c6fe46bd5df5aeba321f4af3cc7aa

-- 
View it on GitLab: https://salsa.debian.org/java-team/airlift-slice/-/commit/0b1214952e2c6fe46bd5df5aeba321f4af3cc7aa
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/pkg-java-commits/attachments/20250116/07ca50af/attachment.htm>