[Pkg-julia-devel] [SCM] utf8proc/master: Imported Upstream version 2.1.0
ginggs at users.alioth.debian.org
ginggs at users.alioth.debian.org
Fri Jul 7 15:52:33 UTC 2017
The following commit has been merged in the master branch:
commit 6f1c1b60552164b40a7b7079f523846a244597f6
Author: Graham Inggs <ginggs at debian.org>
Date: Fri Jul 7 17:18:14 2017 +0200
Imported Upstream version 2.1.0
diff --git a/.gitignore b/.gitignore
index 532fa9a..41a6cff 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,11 +17,12 @@ data/*.sfd
bench/bench
bench/icu
bench/unistring
-normtest
-graphemetest
-printproperty
-charwidth
-valid
-iterate
-case
+test/normtest
+test/graphemetest
+test/printproperty
+test/charwidth
+test/valid
+test/iterate
+test/case
+test/custom
/tmp/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5e9b8a1..be676ba 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,8 +10,8 @@ project (utf8proc C)
# API version number (defined in utf8proc.h).
# Be sure to also update these in Makefile and MANIFEST!
set(SO_MAJOR 2)
-set(SO_MINOR 0)
-set(SO_PATCH 2)
+set(SO_MINOR 1)
+set(SO_PATCH 0)
add_definitions (
-DUTF8PROC_EXPORTS
diff --git a/MANIFEST b/MANIFEST
index 106a4f0..b39f8a8 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -2,6 +2,6 @@ include/
include/utf8proc.h
lib/
lib/libutf8proc.a
-lib/libutf8proc.so -> libutf8proc.so.2.0.2
-lib/libutf8proc.so.2 -> libutf8proc.so.2.0.2
-lib/libutf8proc.so.2.0.2
+lib/libutf8proc.so -> libutf8proc.so.2.1.0
+lib/libutf8proc.so.2 -> libutf8proc.so.2.1.0
+lib/libutf8proc.so.2.1.0
diff --git a/Makefile b/Makefile
index 2bde4b1..51995c3 100644
--- a/Makefile
+++ b/Makefile
@@ -21,8 +21,8 @@ UCFLAGS = $(CFLAGS) $(PICFLAG) $(C99FLAG) $(WCFLAGS) -DUTF8PROC_EXPORTS
# The API version number is defined in utf8proc.h.
# Be sure to also update these ABI versions in MANIFEST and CMakeLists.txt!
MAJOR=2
-MINOR=0
-PATCH=2
+MINOR=1
+PATCH=0
OS := $(shell uname)
ifeq ($(OS),Darwin) # MacOS X
@@ -49,7 +49,7 @@ clean:
ifneq ($(OS),Darwin)
rm -f libutf8proc.so.$(MAJOR)
endif
- rm -f test/tests.o test/normtest test/graphemetest test/printproperty test/charwidth test/valid test/iterate test/case
+ rm -f test/tests.o test/normtest test/graphemetest test/printproperty test/charwidth test/valid test/iterate test/case test/custom
rm -rf MANIFEST.new tmp
$(MAKE) -C bench clean
$(MAKE) -C data clean
@@ -136,7 +136,10 @@ test/iterate: test/iterate.c test/tests.o utf8proc.o utf8proc.h test/tests.h
test/case: test/case.c test/tests.o utf8proc.o utf8proc.h test/tests.h
$(CC) $(UCFLAGS) test/case.c test/tests.o utf8proc.o -o $@
-check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/charwidth test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
+test/custom: test/custom.c test/tests.o utf8proc.o utf8proc.h test/tests.h
+ $(CC) $(UCFLAGS) test/custom.c test/tests.o utf8proc.o -o $@
+
+check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/custom test/charwidth test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
$(MAKE) -C bench
test/normtest data/NormalizationTest.txt
test/graphemetest data/GraphemeBreakTest.txt
@@ -144,3 +147,4 @@ check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeB
test/valid
test/iterate
test/case
+ test/custom
diff --git a/NEWS.md b/NEWS.md
index 6f4c131..22eff0e 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,23 @@
# utf8proc release history #
+## Version 2.1 ##
+
+2016-12-16:
+
+- New functions `utf8proc_map_custom` and `utf8proc_decompose_custom`
+ to allow user-supplied transformations of codepoints, in conjunction
+ with other transformations ([#89]).
+
+- New function `utf8proc_normalize_utf32` to apply normalizations
+ directly to UTF-32 data (not just UTF-8) ([#88]).
+
+- Fixed stack overflow that could occur due to incorrect definition
+ of `UINT16_MAX` with some compilers ([#84]).
+
+- Fixed conflict with `stdbool.h` in Visual Studio ([#90]).
+
+- Updated font metrics to use Unifont 9.0.04.
+
## Version 2.0.2 ##
2016-07-27:
@@ -277,3 +295,9 @@ Release of version 1.0.1
[#70]: https://github.com/JuliaLang/utf8proc/issues/70
[#77]: https://github.com/JuliaLang/utf8proc/issues/77
[#78]: https://github.com/JuliaLang/utf8proc/issues/78
+[#79]: https://github.com/JuliaLang/utf8proc/issues/79
+[#80]: https://github.com/JuliaLang/utf8proc/issues/80
+[#84]: https://github.com/JuliaLang/utf8proc/pull/84
+[#88]: https://github.com/JuliaLang/utf8proc/pull/88
+[#89]: https://github.com/JuliaLang/utf8proc/pull/89
+[#90]: https://github.com/JuliaLang/utf8proc/issues/90
diff --git a/data/Makefile b/data/Makefile
index c41f601..19d375f 100644
--- a/data/Makefile
+++ b/data/Makefile
@@ -20,7 +20,7 @@ utf8proc_data.c.new: data_generator.rb UnicodeData.txt GraphemeBreakProperty.txt
$(RUBY) data_generator.rb < UnicodeData.txt > $@
# GNU Unifont version for font metric calculations:
-UNIFONT_VERSION=9.0.01
+UNIFONT_VERSION=9.0.04
unifont.ttf:
$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://mirrors.kernel.org/gnu/unifont/unifont-$(UNIFONT_VERSION)/unifont-$(UNIFONT_VERSION).ttf
diff --git a/test/custom.c b/test/custom.c
new file mode 100644
index 0000000..f85b3cc
--- /dev/null
+++ b/test/custom.c
@@ -0,0 +1,27 @@
+#include "tests.h"
+
+static int thunk_test = 1;
+
+static utf8proc_int32_t custom(utf8proc_int32_t codepoint, void *thunk)
+{
+ check(((int *) thunk) == &thunk_test, "unexpected thunk passed");
+ if (codepoint == 'a')
+ return 'b';
+ if (codepoint == 'S')
+ return 0x00df; /* ß */
+ return codepoint;
+}
+
+int main(void)
+{
+ utf8proc_uint8_t input[] = {0x41,0x61,0x53,0x62,0xef,0xbd,0x81,0x00}; /* "AaSb\uff41" */
+ utf8proc_uint8_t correct[] = {0x61,0x62,0x73,0x73,0x62,0x61,0x00}; /* "abssba" */
+ utf8proc_uint8_t *output;
+ utf8proc_map_custom(input, 0, &output, UTF8PROC_CASEFOLD | UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_NULLTERM,
+ custom, &thunk_test);
+ printf("mapped \"%s\" -> \"%s\"\n", (char*)input, (char*)output);
+ check(strlen((char*) output) == 6, "incorrect output length");
+ check(!memcmp(correct, output, 7), "incorrect output data");
+ free(output);
+ return 0;
+}
diff --git a/utf8proc.c b/utf8proc.c
index ba5143a..c14bbe1 100644
--- a/utf8proc.c
+++ b/utf8proc.c
@@ -166,24 +166,24 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, ut
if (uc < 0x00) {
return 0;
} else if (uc < 0x80) {
- dst[0] = uc;
+ dst[0] = (utf8proc_uint8_t) uc;
return 1;
} else if (uc < 0x800) {
- dst[0] = 0xC0 + (uc >> 6);
- dst[1] = 0x80 + (uc & 0x3F);
+ dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6));
+ dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
return 2;
// Note: we allow encoding 0xd800-0xdfff here, so as not to change
// the API, however, these are actually invalid in UTF-8
} else if (uc < 0x10000) {
- dst[0] = 0xE0 + (uc >> 12);
- dst[1] = 0x80 + ((uc >> 6) & 0x3F);
- dst[2] = 0x80 + (uc & 0x3F);
+ dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12));
+ dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
+ dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
return 3;
} else if (uc < 0x110000) {
- dst[0] = 0xF0 + (uc >> 18);
- dst[1] = 0x80 + ((uc >> 12) & 0x3F);
- dst[2] = 0x80 + ((uc >> 6) & 0x3F);
- dst[3] = 0x80 + (uc & 0x3F);
+ dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18));
+ dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F));
+ dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
+ dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
return 4;
} else return 0;
}
@@ -193,28 +193,28 @@ static utf8proc_ssize_t unsafe_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t
if (uc < 0x00) {
return 0;
} else if (uc < 0x80) {
- dst[0] = uc;
+ dst[0] = (utf8proc_uint8_t)uc;
return 1;
} else if (uc < 0x800) {
- dst[0] = 0xC0 + (uc >> 6);
- dst[1] = 0x80 + (uc & 0x3F);
+ dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6));
+ dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
return 2;
} else if (uc == 0xFFFF) {
- dst[0] = 0xFF;
+ dst[0] = (utf8proc_uint8_t)0xFF;
return 1;
} else if (uc == 0xFFFE) {
- dst[0] = 0xFE;
+ dst[0] = (utf8proc_uint8_t)0xFE;
return 1;
} else if (uc < 0x10000) {
- dst[0] = 0xE0 + (uc >> 12);
- dst[1] = 0x80 + ((uc >> 6) & 0x3F);
- dst[2] = 0x80 + (uc & 0x3F);
+ dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12));
+ dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
+ dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
return 3;
} else if (uc < 0x110000) {
- dst[0] = 0xF0 + (uc >> 18);
- dst[1] = 0x80 + ((uc >> 12) & 0x3F);
- dst[2] = 0x80 + ((uc >> 6) & 0x3F);
- dst[3] = 0x80 + (uc & 0x3F);
+ dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18));
+ dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F));
+ dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
+ dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
return 4;
} else return 0;
}
@@ -391,8 +391,6 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) {
return s[utf8proc_category(c)];
}
-
-
#define utf8proc_decompose_lump(replacement_uc) \
return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
options & ~UTF8PROC_LUMP, last_boundclass)
@@ -486,6 +484,14 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
) {
+ return utf8proc_decompose_custom(str, strlen, buffer, bufsize, options, NULL, NULL);
+}
+
+UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom(
+ const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
+ utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options,
+ utf8proc_custom_func custom_func, void *custom_data
+) {
/* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */
utf8proc_ssize_t wpos = 0;
if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE))
@@ -511,6 +517,9 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc);
if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
}
+ if (custom_func != NULL) {
+ uc = custom_func(uc, custom_data); /* user-specified custom mapping */
+ }
decomp_result = utf8proc_decompose_char(
uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options,
&boundclass
@@ -545,9 +554,8 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
return wpos;
}
-UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
- /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored
- ASSERT: 'buffer' has one spare byte of free space at the end! */
+UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
+ /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored */
if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) {
utf8proc_ssize_t rpos;
utf8proc_ssize_t wpos = 0;
@@ -655,6 +663,14 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
}
length = wpos;
}
+ return length;
+}
+
+UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
+ /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored
+ ASSERT: 'buffer' has one spare byte of free space at the end! */
+ length = utf8proc_normalize_utf32(buffer, length, options);
+ if (length < 0) return length;
{
utf8proc_ssize_t rpos, wpos = 0;
utf8proc_int32_t uc;
@@ -677,14 +693,21 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
) {
+ return utf8proc_map_custom(str, strlen, dstptr, options, NULL, NULL);
+}
+
+UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
+ const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options,
+ utf8proc_custom_func custom_func, void *custom_data
+) {
utf8proc_int32_t *buffer;
utf8proc_ssize_t result;
*dstptr = NULL;
- result = utf8proc_decompose(str, strlen, NULL, 0, options);
+ result = utf8proc_decompose_custom(str, strlen, NULL, 0, options, custom_func, custom_data);
if (result < 0) return result;
buffer = (utf8proc_int32_t *) malloc(result * sizeof(utf8proc_int32_t) + 1);
if (!buffer) return UTF8PROC_ERROR_NOMEM;
- result = utf8proc_decompose(str, strlen, buffer, result, options);
+ result = utf8proc_decompose_custom(str, strlen, buffer, result, options, custom_func, custom_data);
if (result < 0) {
free(buffer);
return result;
@@ -730,4 +753,3 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str)
UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
return retval;
}
-
diff --git a/utf8proc.h b/utf8proc.h
index 82c0902..edf46d4 100644
--- a/utf8proc.h
+++ b/utf8proc.h
@@ -71,14 +71,15 @@
/** The MAJOR version number (increased when backwards API compatibility is broken). */
#define UTF8PROC_VERSION_MAJOR 2
/** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */
-#define UTF8PROC_VERSION_MINOR 0
+#define UTF8PROC_VERSION_MINOR 1
/** The PATCH version (increased for fixes that do not change the API). */
-#define UTF8PROC_VERSION_PATCH 2
+#define UTF8PROC_VERSION_PATCH 0
/** @} */
#include <stdlib.h>
#include <sys/types.h>
-#ifdef _MSC_VER
+#if defined(_MSC_VER) && _MSC_VER < 1800
+// MSVC prior to 2013 lacked stdbool.h and inttypes.h
typedef signed char utf8proc_int8_t;
typedef unsigned char utf8proc_uint8_t;
typedef short utf8proc_int16_t;
@@ -93,12 +94,18 @@ typedef int utf8proc_ssize_t;
typedef unsigned int utf8proc_size_t;
# endif
# ifndef __cplusplus
+// emulate C99 bool
typedef unsigned char utf8proc_bool;
-enum {false, true};
+# ifndef __bool_true_false_are_defined
+# define false 0
+# define true 1
+# define __bool_true_false_are_defined 1
+# endif
# else
typedef bool utf8proc_bool;
# endif
#else
+# include <stddef.h>
# include <stdbool.h>
# include <inttypes.h>
typedef int8_t utf8proc_int8_t;
@@ -108,7 +115,7 @@ typedef uint16_t utf8proc_uint16_t;
typedef int32_t utf8proc_int32_t;
typedef uint32_t utf8proc_uint32_t;
typedef size_t utf8proc_size_t;
-typedef ssize_t utf8proc_ssize_t;
+typedef ptrdiff_t utf8proc_ssize_t;
typedef bool utf8proc_bool;
#endif
#include <limits.h>
@@ -134,7 +141,7 @@ extern "C" {
#endif
#ifndef UINT16_MAX
-# define UINT16_MAX ~(utf8proc_uint16_t)0
+# define UINT16_MAX 65535U
#endif
/**
@@ -374,6 +381,13 @@ typedef enum {
} utf8proc_boundclass_t;
/**
+ * Function pointer type passed to @ref utf8proc_map_custom and
+ * @ref utf8proc_decompose_custom, which is used to specify a user-defined
+ * mapping of codepoints to be applied in conjunction with other mappings.
+ */
+typedef utf8proc_int32_t (*utf8proc_custom_func)(utf8proc_int32_t codepoint, void *data);
+
+/**
* Array containing the byte lengths of a UTF-8 encoded codepoint based
* on the first byte.
*/
@@ -480,6 +494,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(
* `buffer` (which must contain at least `bufsize` entries). In case of
* success, the number of codepoints written is returned; in case of an
* error, a negative error code is returned (@ref utf8proc_errmsg).
+ * See @ref utf8proc_decompose_custom to supply additional transformations.
*
* If the number of written codepoints would be bigger than `bufsize`, the
* required buffer size is returned, while the buffer will be overwritten with
@@ -491,8 +506,46 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
);
/**
+ * The same as @ref utf8proc_decompose, but also takes a `custom_func` mapping function
+ * that is called on each codepoint in `str` before any other transformations
+ * (along with a `custom_data` pointer that is passed through to `custom_func`).
+ * The `custom_func` argument is ignored if it is `NULL`. See also @ref utf8proc_map_custom.
+ */
+UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom(
+ const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
+ utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options,
+ utf8proc_custom_func custom_func, void *custom_data
+);
+
+/**
+ * Normalizes the sequence of `length` codepoints pointed to by `buffer`
+ * in-place (i.e., the result is also stored in `buffer`).
+ *
+ * @param buffer the (native-endian UTF-32) unicode codepoints to re-encode.
+ * @param length the length (in codepoints) of the buffer.
+ * @param options a bitwise or (`|`) of one or more of the following flags:
+ * - @ref UTF8PROC_NLF2LS - convert LF, CRLF, CR and NEL into LS
+ * - @ref UTF8PROC_NLF2PS - convert LF, CRLF, CR and NEL into PS
+ * - @ref UTF8PROC_NLF2LF - convert LF, CRLF, CR and NEL into LF
+ * - @ref UTF8PROC_STRIPCC - strip or convert all non-affected control characters
+ * - @ref UTF8PROC_COMPOSE - try to combine decomposed codepoints into composite
+ * codepoints
+ * - @ref UTF8PROC_STABLE - prohibit combining characters that would violate
+ * the unicode versioning stability
+ *
+ * @return
+ * In case of success, the length (in codepoints) of the normalized UTF-32 string is
+ * returned; otherwise, a negative error code is returned (@ref utf8proc_errmsg).
+ *
+ * @warning The entries of the array pointed to by `str` have to be in the
+ * range `0x0000` to `0x10FFFF`. Otherwise, the program might crash!
+ */
+UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options);
+
+/**
* Reencodes the sequence of `length` codepoints pointed to by `buffer`
* UTF-8 data in-place (i.e., the result is also stored in `buffer`).
+ * Can optionally normalize the UTF-32 sequence prior to UTF-8 conversion.
*
* @param buffer the (native-endian UTF-32) unicode codepoints to re-encode.
* @param length the length (in codepoints) of the buffer.
@@ -505,10 +558,12 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
* codepoints
* - @ref UTF8PROC_STABLE - prohibit combining characters that would violate
* the unicode versioning stability
+ * - @ref UTF8PROC_CHARBOUND - insert 0xFF bytes before each grapheme cluster
*
* @return
- * In case of success, the length (in bytes) of the resulting UTF-8 string is
- * returned; otherwise, a negative error code is returned (@ref utf8proc_errmsg).
+ * In case of success, the length (in bytes) of the resulting nul-terminated
+ * UTF-8 string is returned; otherwise, a negative error code is returned
+ * (@ref utf8proc_errmsg).
*
* @warning The amount of free space pointed to by `buffer` must
* exceed the amount of the input data by one byte, and the
@@ -567,7 +622,7 @@ UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c);
* Given a codepoint, return a character width analogous to `wcwidth(codepoint)`,
* except that a width of 0 is returned for non-printable codepoints
* instead of -1 as in `wcwidth`.
- *
+ *
* @note
* If you want to check for particular types of non-printable characters,
* (analogous to `isprint` or `iscntrl`), use @ref utf8proc_category. */
@@ -595,7 +650,8 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t codepoi
* in any case the result will be NULL terminated (though it might
* contain NULL characters with the string if `str` contained NULL
* characters). Other flags in the `options` field are passed to the
- * functions defined above, and regarded as described.
+ * functions defined above, and regarded as described. See also
+ * @ref utfproc_map_custom to supply a custom codepoint transformation.
*
* In case of success the length of the new string is returned,
* otherwise a negative error code is returned.
@@ -607,6 +663,17 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
);
+/**
+ * Like @ref utf8proc_map, but also takes a `custom_func` mapping function
+ * that is called on each codepoint in `str` before any other transformations
+ * (along with a `custom_data` pointer that is passed through to `custom_func`).
+ * The `custom_func` argument is ignored if it is `NULL`.
+ */
+UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
+ const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options,
+ utf8proc_custom_func custom_func, void *custom_data
+);
+
/** @name Unicode normalization
*
* Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC
@@ -619,9 +686,9 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str);
/** NFC normalization (@ref UTF8PROC_COMPOSE). */
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str);
-/** NFD normalization (@ref UTF8PROC_DECOMPOSE and @ref UTF8PROC_COMPAT). */
+/** NFKD normalization (@ref UTF8PROC_DECOMPOSE and @ref UTF8PROC_COMPAT). */
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str);
-/** NFD normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */
+/** NFKC normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str);
/** @} */
@@ -630,4 +697,3 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str);
#endif
#endif
-
diff --git a/utf8proc_data.c b/utf8proc_data.c
index 02541e0..d8a56bb 100644
--- a/utf8proc_data.c
+++ b/utf8proc_data.c
@@ -5896,7 +5896,7 @@ const utf8proc_uint16_t utf8proc_stage2table[] = {
540, 540, 540, 1180, 0, 0, 0, 0,
0, 1154, 1154, 1154, 1154, 1154, 1154, 1154,
1154, 1154, 1154, 0, 0, 0, 0, 1103,
- 1158, 0, 0, 0, 0, 0, 0, 0,
+ 1103, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
@@ -7847,7 +7847,7 @@ const utf8proc_property_t utf8proc_properties[] = {
{UTF8PROC_CATEGORY_MN, 122, UTF8PROC_BIDI_CLASS_NSM, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false, false, false, false, 0, 0, UTF8PROC_BOUNDCLASS_EXTEND},
{UTF8PROC_CATEGORY_LO, 0, UTF8PROC_BIDI_CLASS_L, UTF8PROC_DECOMP_TYPE_COMPAT, 9523, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
{UTF8PROC_CATEGORY_LO, 0, UTF8PROC_BIDI_CLASS_L, UTF8PROC_DECOMP_TYPE_COMPAT, 9525, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
- {UTF8PROC_CATEGORY_PO, 0, UTF8PROC_BIDI_CLASS_L, UTF8PROC_DECOMP_TYPE_NOBREAK, 1335, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
+ {UTF8PROC_CATEGORY_PO, 0, UTF8PROC_BIDI_CLASS_L, UTF8PROC_DECOMP_TYPE_NOBREAK, 1335, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false, false, false, false, 2, 0, UTF8PROC_BOUNDCLASS_OTHER},
{UTF8PROC_CATEGORY_MN, 216, UTF8PROC_BIDI_CLASS_NSM, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false, false, false, false, 0, 0, UTF8PROC_BOUNDCLASS_EXTEND},
{UTF8PROC_CATEGORY_PS, 0, UTF8PROC_BIDI_CLASS_ON, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, true, false, false, false, 2, 0, UTF8PROC_BOUNDCLASS_OTHER},
{UTF8PROC_CATEGORY_PE, 0, UTF8PROC_BIDI_CLASS_ON, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, true, false, false, false, 2, 0, UTF8PROC_BOUNDCLASS_OTHER},
@@ -10475,7 +10475,7 @@ const utf8proc_property_t utf8proc_properties[] = {
{UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 1470, UINT16_MAX, 1470, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
{UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 1478, UINT16_MAX, 1478, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
{UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 5132, UINT16_MAX, 5132, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
- {UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 1480, UINT16_MAX, 1480, UINT16_MAX, UINT16_MAX, false, false, false, false, 2, 0, UTF8PROC_BOUNDCLASS_OTHER},
+ {UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 1480, UINT16_MAX, 1480, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
{UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 5133, UINT16_MAX, 5133, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
{UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 5134, UINT16_MAX, 5134, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
{UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 1482, UINT16_MAX, 1482, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
@@ -12165,7 +12165,7 @@ const utf8proc_property_t utf8proc_properties[] = {
{UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 6787, UINT16_MAX, 6787, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
{UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 6789, UINT16_MAX, 6789, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
{UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 6791, UINT16_MAX, 6791, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
- {UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 6793, UINT16_MAX, 6793, UINT16_MAX, UINT16_MAX, false, false, false, false, 2, 0, UTF8PROC_BOUNDCLASS_OTHER},
+ {UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 6793, UINT16_MAX, 6793, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
{UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 6795, UINT16_MAX, 6795, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
{UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 6797, UINT16_MAX, 6797, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
{UTF8PROC_CATEGORY_LU, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, 6799, UINT16_MAX, 6799, UINT16_MAX, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
@@ -12201,7 +12201,7 @@ const utf8proc_property_t utf8proc_properties[] = {
{UTF8PROC_CATEGORY_LL, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, UINT16_MAX, 9104, UINT16_MAX, 9104, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
{UTF8PROC_CATEGORY_LL, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, UINT16_MAX, 9106, UINT16_MAX, 9106, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
{UTF8PROC_CATEGORY_LL, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, UINT16_MAX, 9108, UINT16_MAX, 9108, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
- {UTF8PROC_CATEGORY_LL, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, UINT16_MAX, 9110, UINT16_MAX, 9110, UINT16_MAX, false, false, false, false, 2, 0, UTF8PROC_BOUNDCLASS_OTHER},
+ {UTF8PROC_CATEGORY_LL, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, UINT16_MAX, 9110, UINT16_MAX, 9110, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
{UTF8PROC_CATEGORY_LL, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, UINT16_MAX, 9112, UINT16_MAX, 9112, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
{UTF8PROC_CATEGORY_LL, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, UINT16_MAX, 9114, UINT16_MAX, 9114, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
{UTF8PROC_CATEGORY_LL, 0, UTF8PROC_BIDI_CLASS_L, 0, UINT16_MAX, UINT16_MAX, 9116, UINT16_MAX, 9116, UINT16_MAX, false, false, false, false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},
--
utf8proc packaging
More information about the Pkg-julia-devel
mailing list