=provides
__UNDEFINED__
SvUTF8
UTF8f
UTF8fARG
utf8_to_uvchr_buf
sv_len_utf8
sv_len_utf8_nomg
=implementation
#ifdef SVf_UTF8
__UNDEFINED__ SvUTF8(sv) (SvFLAGS(sv) & SVf_UTF8)
#endif
#if { VERSION == 5.19.1 } /* 5.19.1 does not have UTF8fARG, only broken UTF8f */
#undef UTF8f
#endif
#ifdef SVf_UTF8
__UNDEFINED__ UTF8f SVf
__UNDEFINED__ UTF8fARG(u,l,p) newSVpvn_flags((p), (l), ((u) ? SVf_UTF8 : 0) | SVs_TEMP)
#endif
#define D_PPP_MIN(a,b) (((a) <= (b)) ? (a) : (b))
__UNDEFINED__ UNICODE_REPLACEMENT 0xFFFD
#ifdef UTF8_MAXLEN
__UNDEFINED__ UTF8_MAXBYTES UTF8_MAXLEN
#endif
__UNDEF_NOT_PROVIDED__ UTF_START_MARK(len) \
(((len) > 7) ? 0xFF : (0xFF & (0xFE << (7-(len)))))
/* On non-EBCDIC was valid for some releases earlier than this, but easier to
* just do one check */
#if { VERSION < 5.018 }
# undef UTF8_MAXBYTES_CASE
#endif
#if 'A' == 65
# define D_PPP_BYTE_INFO_BITS 6 /* 6 bits meaningful in continuation bytes */
__UNDEFINED__ UTF8_MAXBYTES_CASE 13
#else
# define D_PPP_BYTE_INFO_BITS 5 /* 5 bits meaningful in continuation bytes */
__UNDEFINED__ UTF8_MAXBYTES_CASE 15
#endif
__UNDEF_NOT_PROVIDED__ UTF_ACCUMULATION_SHIFT D_PPP_BYTE_INFO_BITS
#ifdef NATIVE_TO_UTF
__UNDEF_NOT_PROVIDED__ NATIVE_UTF8_TO_I8(c) NATIVE_TO_UTF(c)
#else /* System doesn't support EBCDIC */
__UNDEF_NOT_PROVIDED__ NATIVE_UTF8_TO_I8(c) (c)
#endif
#ifdef UTF_TO_NATIVE
__UNDEF_NOT_PROVIDED__ I8_TO_NATIVE_UTF8(c) UTF_TO_NATIVE(c)
#else /* System doesn't support EBCDIC */
__UNDEF_NOT_PROVIDED__ I8_TO_NATIVE_UTF8(c) (c)
#endif
__UNDEF_NOT_PROVIDED__ UTF_START_MASK(len) \
(((len) >= 7) ? 0x00 : (0x1F >> ((len)-2)))
__UNDEF_NOT_PROVIDED__ UTF_IS_CONTINUATION_MASK \
((U8) (0xFF << UTF_ACCUMULATION_SHIFT))
__UNDEF_NOT_PROVIDED__ UTF_CONTINUATION_MARK \
(UTF_IS_CONTINUATION_MASK & 0xB0)
__UNDEF_NOT_PROVIDED__ UTF_MIN_START_BYTE \
((UTF_CONTINUATION_MARK >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2))
__UNDEF_NOT_PROVIDED__ UTF_MIN_ABOVE_LATIN1_BYTE \
((0x100 >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2))
#if { VERSION < 5.007 } /* Was the complement of what should have been */
# undef UTF8_IS_DOWNGRADEABLE_START
#endif
__UNDEF_NOT_PROVIDED__ UTF8_IS_DOWNGRADEABLE_START(c) \
inRANGE(NATIVE_UTF8_TO_I8(c), \
UTF_MIN_START_BYTE, UTF_MIN_ABOVE_LATIN1_BYTE - 1)
__UNDEF_NOT_PROVIDED__ UTF_CONTINUATION_MASK \
((U8) ((1U << UTF_ACCUMULATION_SHIFT) - 1))
__UNDEF_NOT_PROVIDED__ UTF8_ACCUMULATE(base, added) \
(((base) << UTF_ACCUMULATION_SHIFT) \
| ((NATIVE_UTF8_TO_I8(added)) \
& UTF_CONTINUATION_MASK))
__UNDEF_NOT_PROVIDED__ UTF8_ALLOW_ANYUV 0
__UNDEF_NOT_PROVIDED__ UTF8_ALLOW_EMPTY 0x0001
__UNDEF_NOT_PROVIDED__ UTF8_ALLOW_CONTINUATION 0x0002
__UNDEF_NOT_PROVIDED__ UTF8_ALLOW_NON_CONTINUATION 0x0004
__UNDEF_NOT_PROVIDED__ UTF8_ALLOW_SHORT 0x0008
__UNDEF_NOT_PROVIDED__ UTF8_ALLOW_LONG 0x0010
__UNDEF_NOT_PROVIDED__ UTF8_ALLOW_OVERFLOW 0x0080
__UNDEF_NOT_PROVIDED__ UTF8_ALLOW_ANY ( UTF8_ALLOW_CONTINUATION \
|UTF8_ALLOW_NON_CONTINUATION \
|UTF8_ALLOW_SHORT \
|UTF8_ALLOW_LONG \
|UTF8_ALLOW_OVERFLOW)
#if defined UTF8SKIP
/* Don't use official versions because they use MIN, which may not be available */
#undef UTF8_SAFE_SKIP
#undef UTF8_CHK_SKIP
__UNDEFINED__ UTF8_SAFE_SKIP(s, e) ( \
((((e) - (s)) <= 0) \
? 0 \
: D_PPP_MIN(((e) - (s)), UTF8SKIP(s))))
__UNDEFINED__ UTF8_CHK_SKIP(s) \
(s[0] == '\0' ? 1 : ((U8) D_PPP_MIN(my_strnlen((char *) (s), UTF8SKIP(s)), \
UTF8SKIP(s))))
/* UTF8_CHK_SKIP depends on my_strnlen */
__UNDEFINED__ UTF8_SKIP(s) UTF8SKIP(s)
#endif
#if 'A' == 65
__UNDEFINED__ UTF8_IS_INVARIANT(c) isASCII(c)
#else
__UNDEFINED__ UTF8_IS_INVARIANT(c) (isASCII(c) || isCNTRL_L1(c))
#endif
__UNDEFINED__ UVCHR_IS_INVARIANT(c) UTF8_IS_INVARIANT(c)
#ifdef UVCHR_IS_INVARIANT
# if 'A' != 65 || UVSIZE < 8
/* 32 bit platform, which includes UTF-EBCDIC on the releases this is
* backported to */
# define D_PPP_UVCHR_SKIP_UPPER(c) 7
# else
# define D_PPP_UVCHR_SKIP_UPPER(c) \
(((WIDEST_UTYPE) (c)) < \
(((WIDEST_UTYPE) 1) << (6 * D_PPP_BYTE_INFO_BITS)) ? 7 : 13)
# endif
__UNDEFINED__ UVCHR_SKIP(c) \
UVCHR_IS_INVARIANT(c) ? 1 : \
(WIDEST_UTYPE) (c) < (32 * (1U << ( D_PPP_BYTE_INFO_BITS))) ? 2 : \
(WIDEST_UTYPE) (c) < (16 * (1U << (2 * D_PPP_BYTE_INFO_BITS))) ? 3 : \
(WIDEST_UTYPE) (c) < ( 8 * (1U << (3 * D_PPP_BYTE_INFO_BITS))) ? 4 : \
(WIDEST_UTYPE) (c) < ( 4 * (1U << (4 * D_PPP_BYTE_INFO_BITS))) ? 5 : \
(WIDEST_UTYPE) (c) < ( 2 * (1U << (5 * D_PPP_BYTE_INFO_BITS))) ? 6 : \
D_PPP_UVCHR_SKIP_UPPER(c)
#endif
#ifdef is_ascii_string
__UNDEFINED__ is_invariant_string(s,l) is_ascii_string(s,l)
__UNDEFINED__ is_utf8_invariant_string(s,l) is_ascii_string(s,l)
/* Hint: is_ascii_string, is_invariant_string
is_utf8_invariant_string() does the same thing and is preferred because its
name is more accurate as to what it does */
#endif
#ifdef ibcmp_utf8
__UNDEFINED__ foldEQ_utf8(s1,pe1,l1,u1,s2,pe2,l2,u2) \
cBOOL(! ibcmp_utf8(s1,pe1,l1,u1,s2,pe2,l2,u2))
#endif
#if defined(is_utf8_string) && defined(UTF8SKIP)
__UNDEFINED__ isUTF8_CHAR(s, e) ( \
(e) <= (s) || ! is_utf8_string(s, UTF8_SAFE_SKIP(s, e)) \
? 0 \
: UTF8SKIP(s))
#endif
#if 'A' == 65
__UNDEFINED__ BOM_UTF8 "\xEF\xBB\xBF"
__UNDEFINED__ REPLACEMENT_CHARACTER_UTF8 "\xEF\xBF\xBD"
#elif '^' == 95
__UNDEFINED__ BOM_UTF8 "\xDD\x73\x66\x73"
__UNDEFINED__ REPLACEMENT_CHARACTER_UTF8 "\xDD\x73\x73\x71"
#elif '^' == 176
__UNDEFINED__ BOM_UTF8 "\xDD\x72\x65\x72"
__UNDEFINED__ REPLACEMENT_CHARACTER_UTF8 "\xDD\x72\x72\x70"
#else
# error Unknown character set
#endif
#if { VERSION < 5.35.10 }
/* Versions prior to 5.31.4 accepted things that are now considered
* malformations, and didn't return -1 on error with warnings enabled.
* Versions before 5.35.10 dereferenced empty input without checking */
# undef utf8_to_uvchr_buf
#endif
/* This implementation brings modern, generally more restricted standards to
* utf8_to_uvchr_buf. Some of these are security related, and clearly must
* be done. But its arguable that the others need not, and hence should not.
* The reason they're here is that a module that intends to play with the
* latest perls should be able to work the same in all releases. An example is
* that perl no longer accepts any UV for a code point, but limits them to
* IV_MAX or below. This is for future internal use of the larger code points.
* If it turns out that some of these changes are breaking code that isn't
* intended to work with modern perls, the tighter restrictions could be
* relaxed. khw thinks this is unlikely, but has been wrong in the past. */
/* 5.6.0 is the first release with UTF-8, and we don't implement this function
* there due to its likely lack of still being in use, and the underlying
* implementation is very different from later ones, without the later
* safeguards, so would require extra work to deal with */
#if { VERSION >= 5.6.1 } && ! defined(utf8_to_uvchr_buf)
/* Choose which underlying implementation to use. At least one must be
* present or the perl is too early to handle this function */
# if defined(utf8n_to_uvchr) || defined(utf8_to_uvchr) || defined(utf8_to_uv)
# if defined(utf8n_to_uvchr) /* This is the preferred implementation */
# define D_PPP_utf8_to_uvchr_buf_callee utf8n_to_uvchr
# elif /* Must be at least 5.6.1 from #if above; \
If have both regular and _simple, regular has all args */ \
defined(utf8_to_uv) && defined(utf8_to_uv_simple)
# define D_PPP_utf8_to_uvchr_buf_callee utf8_to_uv
# elif defined(utf8_to_uvchr) /* The below won't work well on error input */
# define D_PPP_utf8_to_uvchr_buf_callee(s, curlen, retlen, flags) \
utf8_to_uvchr((U8 *)(s), (retlen))
# else
# define D_PPP_utf8_to_uvchr_buf_callee(s, curlen, retlen, flags) \
utf8_to_uv((U8 *)(s), (retlen))
# endif
# endif
# if { NEED utf8_to_uvchr_buf }
UV
utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
{
# if { VERSION >= 5.31.4 } /* But from above, must be < 5.35.10 */
# if { VERSION != 5.35.9 }
/* Versions less than 5.35.9 could dereference s on zero length, so
* pass it something where no harm comes from that. */
if (send <= s) s = send = (U8 *) "?";
return Perl_utf8_to_uvchr_buf_helper(aTHX_ s, send, retlen);
# else /* Below is 5.35.9, which also works on non-empty input, but
for empty input, can wrongly dereference, and additionally is
also just plain broken */
if (send > s) return Perl_utf8_to_uvchr_buf_helper(aTHX_ s, send, retlen);
if (! ckWARN_d(WARN_UTF8)) {
if (retlen) *retlen = 0;
return UNICODE_REPLACEMENT;
}
else {
s = send = (U8 *) "?";
/* Call just for its warning */
(void) Perl__utf8n_to_uvchr_msgs_helper(s, 0, NULL, 0, NULL, NULL);
if (retlen) *retlen = (STRLEN) -1;
return 0;
}
# endif
# else
UV ret;
STRLEN curlen;
bool overflows = 0;
const U8 *cur_s = s;
const bool do_warnings = ckWARN_d(WARN_UTF8);
# if { VERSION < 5.26.0 } && ! defined(EBCDIC)
STRLEN overflow_length = 0;
# endif
if (send > s) {
curlen = send - s;
}
else {
assert(0); /* Modern perls die under this circumstance */
curlen = 0;
if (! do_warnings) { /* Handle empty here if no warnings needed */
if (retlen) *retlen = 0;
return UNICODE_REPLACEMENT;
}
}
# if { VERSION < 5.26.0 } && ! defined(EBCDIC)
/* Perl did not properly detect overflow for much of its history on
* non-EBCDIC platforms, often returning an overlong value which may or may
* not have been tolerated in the call. Also, earlier versions, when they
* did detect overflow, may have disallowed it completely. Modern ones can
* replace it with the REPLACEMENT CHARACTER, depending on calling
* parameters. Therefore detect it ourselves in releases it was
* problematic in. */
if (curlen > 0 && UNLIKELY(*s >= 0xFE)) {
/* First, on a 32-bit machine the first byte being at least \xFE
* automatically is overflow, as it indicates something requiring more
* than 31 bits */
if (sizeof(ret) < 8) {
overflows = 1;
overflow_length = (*s == 0xFE) ? 7 : 13;
}
else {
const U8 highest[] = /* 2*63-1 */
"\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF";
const U8 *cur_h = highest;
for (cur_s = s; cur_s < send; cur_s++, cur_h++) {
if (UNLIKELY(*cur_s == *cur_h)) {
continue;
}
/* If this byte is larger than the corresponding highest UTF-8
* byte, the sequence overflows; otherwise the byte is less
* than (as we handled the equality case above), and so the
* sequence doesn't overflow */
overflows = *cur_s > *cur_h;
break;
}
/* Here, either we set the bool and broke out of the loop, or got
* to the end and all bytes are the same which indicates it doesn't
* overflow. If it did overflow, it would be this number of bytes
* */
overflow_length = 13;
}
}
if (UNLIKELY(overflows)) {
ret = 0;
if (! do_warnings && retlen) {
*retlen = overflow_length;
}
}
else
# endif /* < 5.26 */
/* Here, we are either in a release that properly detects overflow, or
* we have checked for overflow and the next statement is executing as
* part of the above conditional where we know we don't have overflow.
*
* The modern versions allow anything that evaluates to a legal UV, but
* not overlongs nor an empty input */
ret = D_PPP_utf8_to_uvchr_buf_callee(
(U8 *) /* Early perls: no const */
s, curlen, retlen, (UTF8_ALLOW_ANYUV
& ~(UTF8_ALLOW_LONG|UTF8_ALLOW_EMPTY)));
# if { VERSION >= 5.26.0 } && { VERSION < 5.28.0 }
/* But actually, more modern versions restrict the UV to being no more than
* what an IV can hold, so it could still have gotten it wrong about
* overflowing. */
if (UNLIKELY(ret > IV_MAX)) {
overflows = 1;
}
# endif
if (UNLIKELY(overflows)) {
if (! do_warnings) {
if (retlen) {
*retlen = D_PPP_MIN(*retlen, UTF8SKIP(s));
*retlen = D_PPP_MIN(*retlen, curlen);
}
return UNICODE_REPLACEMENT;
}
else {
/* We use the error message in use from 5.8-5.26 */
Perl_warner(aTHX_ packWARN(WARN_UTF8),
"Malformed UTF-8 character (overflow at 0x%" UVxf
", byte 0x%02x, after start byte 0x%02x)",
ret, *cur_s, *s);
if (retlen) {
*retlen = (STRLEN) -1;
}
return 0;
}
}
/* Here, did not overflow, but if it failed for some other reason, and
* warnings are off, to emulate the behavior of the real utf8_to_uvchr(),
* try again, allowing anything. (Note a return of 0 is ok if the input
* was '\0') */
if (UNLIKELY(ret == 0 && (curlen == 0 || *s != '\0'))) {
/* If curlen is 0, we already handled the case where warnings are
* disabled, so this 'if' will be true, and so later on, we know that
* 's' is dereferencible */
if (do_warnings) {
if (retlen) {
*retlen = (STRLEN) -1;
}
}
else {
ret = D_PPP_utf8_to_uvchr_buf_callee(
(U8 *) /* Early perls: no const */
s, curlen, retlen, UTF8_ALLOW_ANY);
/* Override with the REPLACEMENT character, as that is what the
* modern version of this function returns */
ret = UNICODE_REPLACEMENT;
# if { VERSION < 5.16.0 }
/* Versions earlier than this don't necessarily return the proper
* length. It should not extend past the end of string, nor past
* what the first byte indicates the length is, nor past the
* continuation characters */
if (retlen && (IV) *retlen >= 0) {
unsigned int i = 1;
*retlen = D_PPP_MIN(*retlen, curlen);
*retlen = D_PPP_MIN(*retlen, UTF8SKIP(s));
do {
# ifdef UTF8_IS_CONTINUATION
if (! UTF8_IS_CONTINUATION(s[i]))
# else /* Versions without the above don't support EBCDIC anyway */
if (s[i] < 0x80 || s[i] > 0xBF)
# endif
{
*retlen = i;
break;
}
} while (++i < *retlen);
}
# endif /* end of < 5.16.0 */
}
}
return ret;
# endif /* end of < 5.31.4 */
}
# endif
#endif
#if defined(UTF8SKIP) && defined(utf8_to_uvchr_buf)
#undef utf8_to_uvchr /* Always redefine this unsafe function so that it refuses
to read past a NUL, making it much less likely to read
off the end of the buffer. A NUL indicates the start
of the next character anyway. If the input isn't
NUL-terminated, the function remains unsafe, as it
always has been. */
__UNDEFINED__ utf8_to_uvchr(s, lp) \
((*(s) == '\0') \
? utf8_to_uvchr_buf(s,((s)+1), lp) /* Handle single NUL specially */ \
: utf8_to_uvchr_buf(s, (s) + UTF8_CHK_SKIP(s), (lp)))
#endif
/* Hint: utf8_to_uvchr
Use utf8_to_uvchr_buf() instead. But ONLY if you KNOW the upper bound
of the input string (not resorting to using UTF8SKIP, etc., to infer it).
The backported utf8_to_uvchr() will do a better job to prevent most cases
of trying to read beyond the end of the buffer */
/* Replace utf8_to_uvchr with utf8_to_uvchr_buf */
#ifdef sv_len_utf8
# if { VERSION >= 5.17.5 }
# ifndef sv_len_utf8_nomg
# if defined(PERL_USE_GCC_BRACE_GROUPS)
# define sv_len_utf8_nomg(sv) \
({ \
SV *sv_ = (sv); \
sv_len_utf8(!SvGMAGICAL(sv_) \
? sv_ \
: sv_mortalcopy_flags(sv_, SV_NOSTEAL)); \
})
# else
PERL_STATIC_INLINE STRLEN D_PPP_sv_len_utf8_nomg(SV * sv)
{
dTHX;
if (SvGMAGICAL(sv))
return sv_len_utf8(sv_mortalcopy_flags(sv,
SV_NOSTEAL));
else return sv_len_utf8(sv);
}
# define sv_len_utf8_nomg(sv) D_PPP_sv_len_utf8_nomg(sv)
# endif
# endif
# else /* < 5.17.5 */
/* Older Perl versions have broken sv_len_utf8() when passed sv does not
* have SVf_UTF8 flag set */
/* Also note that SvGETMAGIC() may change presence of SVf_UTF8 flag */
# undef sv_len_utf8
# if defined(PERL_USE_GCC_BRACE_GROUPS)
# define sv_len_utf8_nomg(sv) \
({ \
SV *sv2 = (sv); \
STRLEN len; \
if (SvUTF8(sv2)) { \
if (SvGMAGICAL(sv2)) \
len = Perl_sv_len_utf8(aTHX_ \
sv_mortalcopy_flags(sv2, \
SV_NOSTEAL));\
else \
len = Perl_sv_len_utf8(aTHX_ sv2); \
} \
else SvPV_nomg(sv2, len); \
len; \
})
# define sv_len_utf8(sv) ({ SV *_sv1 = (sv); \
SvGETMAGIC(_sv1); \
sv_len_utf8_nomg(_sv1); \
})
# else /* Below is no brace groups */
PERL_STATIC_INLINE STRLEN D_PPP_sv_len_utf8_nomg(SV * sv)
{
dTHX;
STRLEN len;
if (SvUTF8(sv)) {
if (SvGMAGICAL(sv))
len = Perl_sv_len_utf8(aTHX_
sv_mortalcopy_flags(sv,
SV_NOSTEAL));
else
len = Perl_sv_len_utf8(aTHX_ sv);
}
else SvPV_nomg(sv, len);
return len;
}
# define sv_len_utf8_nomg(sv) D_PPP_sv_len_utf8_nomg(sv)
PERL_STATIC_INLINE STRLEN D_PPP_sv_len_utf8(SV * sv)
{
dTHX;
SvGETMAGIC(sv);
return sv_len_utf8_nomg(sv);
}
# define sv_len_utf8(sv) D_PPP_sv_len_utf8(sv)
# endif
# endif /* End of < 5.17.5 */
#endif
=xsinit
#define NEED_utf8_to_uvchr_buf
=xsubs
#if defined(UTF8f) && defined(newSVpvf)
void
UTF8f(x)
SV *x
PREINIT:
U32 u;
STRLEN len;
char *ptr;
INIT:
ptr = SvPV(x, len);
u = SvUTF8(x);
PPCODE:
x = sv_2mortal(newSVpvf("[%" UTF8f "]", UTF8fARG(u, len, ptr)));
XPUSHs(x);
XSRETURN(1);
#endif
#if { VERSION >= 5.006 } /* This is just a helper fcn, not publicized */ \
/* as being available and params not what the */ \
/* API function has; works on EBCDIC too */
SV *
uvchr_to_utf8(native)
UV native
PREINIT:
int len;
U8 string[UTF8_MAXBYTES+1];
int i;
UV uni;
CODE:
len = UVCHR_SKIP(native);
for (i = 0; i < len; i++) {
string[i] = '\0';
}
if (len <= 1) {
string[0] = native;
}
else {
i = len;
uni = NATIVE_TO_UNI(native);
while (i-- > 1) {
string[i] = I8_TO_NATIVE_UTF8((uni & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
uni >>= UTF_ACCUMULATION_SHIFT;
}
string[0] = I8_TO_NATIVE_UTF8((uni & UTF_START_MASK(len)) | UTF_START_MARK(len));
}
RETVAL = newSVpvn((char *) string, len);
SvUTF8_on(RETVAL);
OUTPUT:
RETVAL
#endif
#if defined(UTF8_SAFE_SKIP) && defined(UTF8SKIP)
STRLEN
UTF8_SAFE_SKIP(s, adjustment)
char * s
int adjustment
PREINIT:
const char *const_s;
CODE:
const_s = s;
/* Instead of passing in an 'e' ptr, use the real end, adjusted */
RETVAL = UTF8_SAFE_SKIP(const_s, s + UTF8SKIP(s) + adjustment);
OUTPUT:
RETVAL
#endif
#ifdef isUTF8_CHAR
STRLEN
isUTF8_CHAR(s, adjustment)
unsigned char * s
int adjustment
PREINIT:
const unsigned char *const_s;
const unsigned char *const_e;
CODE:
const_s = s;
/* Instead of passing in an 'e' ptr, use the real end, adjusted */
const_e = const_s + UTF8SKIP(const_s) + adjustment;
RETVAL = isUTF8_CHAR(const_s, const_e);
OUTPUT:
RETVAL
#endif
#ifdef foldEQ_utf8
STRLEN
foldEQ_utf8(s1, l1, u1, s2, l2, u2)
char *s1
UV l1
bool u1
char *s2
UV l2
bool u2
PREINIT:
const char *const_s1;
const char *const_s2;
CODE:
const_s1 = s1;
const_s2 = s2;
RETVAL = foldEQ_utf8(const_s1, NULL, l1, u1, const_s2, NULL, l2, u2);
OUTPUT:
RETVAL
#endif
#ifdef utf8_to_uvchr_buf
AV *
utf8_to_uvchr_buf(s, adjustment)
unsigned char *s
int adjustment
PREINIT:
AV *av;
STRLEN len;
const unsigned char *const_s;
CODE:
av = newAV();
const_s = s;
av_push(av, newSVuv(utf8_to_uvchr_buf(const_s,
s + UTF8SKIP(s) + adjustment,
&len)));
if (len == (STRLEN) -1) {
av_push(av, newSViv(-1));
}
else {
av_push(av, newSVuv(len));
}
RETVAL = av;
OUTPUT:
RETVAL
#endif
#ifdef utf8_to_uvchr
AV *
utf8_to_uvchr(s)
unsigned char *s
PREINIT:
AV *av;
STRLEN len;
const unsigned char *const_s;
CODE:
av = newAV();
const_s = s;
av_push(av, newSVuv(utf8_to_uvchr(const_s, &len)));
if (len == (STRLEN) -1) {
av_push(av, newSViv(-1));
}
else {
av_push(av, newSVuv(len));
}
RETVAL = av;
OUTPUT:
RETVAL
#endif
#ifdef sv_len_utf8
STRLEN
sv_len_utf8(sv)
SV *sv
CODE:
RETVAL = sv_len_utf8(sv);
OUTPUT:
RETVAL
#endif
#ifdef sv_len_utf8_nomg
STRLEN
sv_len_utf8_nomg(sv)
SV *sv
CODE:
RETVAL = sv_len_utf8_nomg(sv);
OUTPUT:
RETVAL
#endif
#ifdef UVCHR_IS_INVARIANT
bool
UVCHR_IS_INVARIANT(c)
unsigned c
PREINIT:
CODE:
RETVAL = UVCHR_IS_INVARIANT(c);
OUTPUT:
RETVAL
#endif
#ifdef UVCHR_SKIP
STRLEN
UVCHR_SKIP(c)
UV c
PREINIT:
CODE:
RETVAL = UVCHR_SKIP(c);
OUTPUT:
RETVAL
#endif
=tests plan => 98
BEGIN {
# skip tests on 5.6.0 and earlier, plus 5.7.0
if (ivers($]) <= ivers(5.6) || ivers($]) == ivers(5.7) ) {
skip 'skip: broken utf8 support', 98;
exit;
}
require warnings;
}
is(Devel::PPPort::UTF8f(42), '[42]');
is(Devel::PPPort::UTF8f('abc'), '[abc]');
is(Devel::PPPort::UTF8f("\x{263a}"), "[\x{263a}]");
my $str = "\x{A8}";
if (ivers($]) >= ivers(5.8)) { eval q{utf8::upgrade($str)} }
is(Devel::PPPort::UTF8f($str), "[\x{A8}]");
if (ivers($]) >= ivers(5.8)) { eval q{utf8::downgrade($str)} }
is(Devel::PPPort::UTF8f($str), "[\x{A8}]");
is(&Devel::PPPort::UTF8_SAFE_SKIP("A", 0), 1);
is(&Devel::PPPort::UTF8_SAFE_SKIP("A", -1), 0);
is(&Devel::PPPort::isUTF8_CHAR("A", -1), 0);
is(&Devel::PPPort::isUTF8_CHAR("A", 0), 1);
is(&Devel::PPPort::isUTF8_CHAR("\x{100}", -1), 0);
is(&Devel::PPPort::isUTF8_CHAR("\x{100}", 0), 2);
is(&Devel::PPPort::UVCHR_IS_INVARIANT(ord("A")), 1);
ok(! &Devel::PPPort::UVCHR_IS_INVARIANT(0xb6));
ok(! &Devel::PPPort::UVCHR_IS_INVARIANT(0x100));
is(&Devel::PPPort::UVCHR_SKIP(ord("A")), 1);
is(&Devel::PPPort::UVCHR_SKIP(0xb6), 2, "This is a test");
is(&Devel::PPPort::UVCHR_SKIP(0x3FF), 2);
is(&Devel::PPPort::UVCHR_SKIP(0x3FFF), 3);
is(&Devel::PPPort::UVCHR_SKIP(0x3FFFF), 4);
is(&Devel::PPPort::UVCHR_SKIP(0x3FFFFF), 5);
is(&Devel::PPPort::UVCHR_SKIP(0x3FFFFFF), ord("A") == 65 ? 5 : 6);
is(&Devel::PPPort::UVCHR_SKIP(0x4000000), ord("A") == 65 ? 6 : 7);
if (ord("A") != 65) {
skip("Test not valid on EBCDIC", 1)
}
else {
is(&Devel::PPPort::UVCHR_SKIP(0xFFFFFFFF), 7);
}
if (ivers($]) < ivers(5.8)) {
skip("Perl version too early", 3);
}
else {
is(&Devel::PPPort::foldEQ_utf8("A\x{100}", 3, 1, "a\x{101}", 3, 1), 1);
is(&Devel::PPPort::foldEQ_utf8("A\x{100}", 3, 1, "a\x{102}", 3, 1), 0);
is(&Devel::PPPort::foldEQ_utf8("A\x{100}", 3, 1, "b\x{101}", 3, 1), 0);
}
my $ret = &Devel::PPPort::utf8_to_uvchr("A");
is($ret->[0], ord("A"));
is($ret->[1], 1);
$ret = &Devel::PPPort::utf8_to_uvchr("\0");
is($ret->[0], 0);
is($ret->[1], 1);
$ret = &Devel::PPPort::utf8_to_uvchr_buf("A", 0);
is($ret->[0], ord("A"));
is($ret->[1], 1);
$ret = &Devel::PPPort::utf8_to_uvchr_buf("\0", 0);
is($ret->[0], 0);
is($ret->[1], 1);
my @buf_tests = (
{
input => "A",
adjustment => -1,
warning => eval "qr/empty/",
no_warnings_returned_length => 0,
},
{
input => "\xc4\xc5",
adjustment => 0,
warning => eval "qr/non-continuation/",
no_warnings_returned_length => 1,
},
{
input => "\xc4\x80",
adjustment => -1,
warning => eval "qr/short|1 byte, need 2/",
no_warnings_returned_length => 1,
},
{
input => "\xc0\x81",
adjustment => 0,
warning => eval "qr/overlong|2 bytes, need 1/",
no_warnings_returned_length => 2,
},
{
input => "\xe0\x80\x81",
adjustment => 0,
warning => eval "qr/overlong|3 bytes, need 1/",
no_warnings_returned_length => 3,
},
{
input => "\xf0\x80\x80\x81",
adjustment => 0,
warning => eval "qr/overlong|4 bytes, need 1/",
no_warnings_returned_length => 4,
},
{ # Old algorithm failed to detect this
input => "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf",
adjustment => 0,
warning => eval "qr/overflow/",
no_warnings_returned_length => 13,
},
);
if (ord("A") != 65) { # tests not valid for EBCDIC
skip("Perl version too early", 2 + 4 + (scalar @buf_tests * 5));
}
else {
$ret = &Devel::PPPort::utf8_to_uvchr_buf("\xc4\x80", 0);
is($ret->[0], 0x100);
is($ret->[1], 2);
my @warnings;
local $SIG{__WARN__} = sub { push @warnings, @_; };
{
use warnings 'utf8';
$ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80");
is($ret->[0], 0);
is($ret->[1], -1);
no warnings 'utf8';
$ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80");
is($ret->[0], 0xFFFD);
is($ret->[1], 1);
}
# An empty input is an assertion failure on debugging builds. It is
# deliberately the first test.
require Config; Config->import;
use vars '%Config';
# VMS doesn't put DEBUGGING in ccflags, and Windows doesn't have
# $Config{config_args}. When 5.14 or later can be assumed, use
# Config::non_bincompat_options(), but for now we're stuck with this.
if ( $Config{ccflags} =~ /-DDEBUGGING/
|| $^O eq 'VMS' && $Config{config_args} =~ /\bDDEBUGGING\b/)
{
shift @buf_tests;
skip("Test not valid on DEBUGGING builds", 5);
}
my $test;
for $test (@buf_tests) {
my $input = $test->{'input'};
my $adjustment = $test->{'adjustment'};
my $display = 'utf8_to_uvchr_buf("';
my $i;
for ($i = 0; $i < length($input) + $adjustment; $i++) {
$display .= sprintf "\\x%02x", ord substr($input, $i, 1);
}
$display .= '")';
my $warning = $test->{'warning'};
undef @warnings;
use warnings 'utf8';
$ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment);
is($ret->[0], 0, "returned value $display; warnings enabled");
is($ret->[1], -1, "returned length $display; warnings enabled");
my $all_warnings = join "; ", @warnings;
my $contains = grep { $_ =~ $warning } $all_warnings;
is($contains, 1, $display
. "; Got: '$all_warnings', which should contain '$warning'");
undef @warnings;
no warnings 'utf8';
$ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment);
is($ret->[0], 0xFFFD, "returned value $display; warnings disabled");
is($ret->[1], $test->{'no_warnings_returned_length'},
"returned length $display; warnings disabled");
}
}
if (ivers($]) ge ivers(5.008)) {
BEGIN { if (ivers($]) ge ivers(5.008)) { require utf8; "utf8"->import() } }
is(Devel::PPPort::sv_len_utf8("aščť"), 4);
is(Devel::PPPort::sv_len_utf8_nomg("aščť"), 4);
my $str = "áíé";
utf8::downgrade($str);
is(Devel::PPPort::sv_len_utf8($str), 3);
utf8::downgrade($str);
is(Devel::PPPort::sv_len_utf8_nomg($str), 3);
utf8::upgrade($str);
is(Devel::PPPort::sv_len_utf8($str), 3);
utf8::upgrade($str);
is(Devel::PPPort::sv_len_utf8_nomg($str), 3);
tie my $scalar, 'TieScalarCounter', "é";
is(tied($scalar)->{fetch}, 0);
is(tied($scalar)->{store}, 0);
is(Devel::PPPort::sv_len_utf8($scalar), 2);
is(tied($scalar)->{fetch}, 1);
is(tied($scalar)->{store}, 0);
is(Devel::PPPort::sv_len_utf8($scalar), 3);
is(tied($scalar)->{fetch}, 2);
is(tied($scalar)->{store}, 0);
is(Devel::PPPort::sv_len_utf8($scalar), 4);
is(tied($scalar)->{fetch}, 3);
is(tied($scalar)->{store}, 0);
is(Devel::PPPort::sv_len_utf8_nomg($scalar), 4);
is(tied($scalar)->{fetch}, 3);
is(tied($scalar)->{store}, 0);
is(Devel::PPPort::sv_len_utf8_nomg($scalar), 4);
is(tied($scalar)->{fetch}, 3);
is(tied($scalar)->{store}, 0);
} else {
skip 'skip: no utf8::downgrade/utf8::upgrade support', 23;
}
package TieScalarCounter;
sub TIESCALAR {
my ($class, $value) = @_;
return bless { fetch => 0, store => 0, value => $value }, $class;
}
sub FETCH {
BEGIN { if (main::ivers($]) ge main::ivers(5.008)) { require utf8; "utf8"->import() } }
my ($self) = @_;
$self->{fetch}++;
return $self->{value} .= "é";
}
sub STORE {
my ($self, $value) = @_;
$self->{store}++;
$self->{value} = $value;
}