From Code to Community: Sponsoring The Perl and Raku Conference 2025 Learn more

#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"
#define PkgName "Unicode::Transform"
/* Some functions are defined in this. */
#include "unitrans.h"
#define Num_UTFs_here (8)
/* in the range of valid Unicode (0..10FFFF) */
static STDCHAR
MaxLenAmplUni[Num_UTFs_here * Num_UTFs_here] = {
1, 2, 2, 4, 4, 2, 2, 2,
2, 1, 1, 2, 2, 2, 2, 2,
2, 1, 1, 2, 2, 2, 2, 2,
2, 1, 1, 1, 1, 1, 2, 2,
2, 1, 1, 1, 1, 1, 2, 2,
2, 2, 2, 4, 4, 1, 2, 2,
2, 2, 2, 4, 4, 2, 1, 1,
2, 2, 2, 4, 4, 2, 1, 1,
};
static UV (*ord_uv_in[Num_UTFs_here])(U8 *, STRLEN, STRLEN *) = {
ord_in_unicode,
ord_in_utf16le,
ord_in_utf16be,
ord_in_utf32le,
ord_in_utf32be,
ord_in_utf8,
ord_in_utf8mod,
ord_in_utfcp1047,
};
static U8* (*app_uv_in[Num_UTFs_here])(U8 *, UV) = {
app_in_unicode,
app_in_utf16le,
app_in_utf16be,
app_in_utf32le,
app_in_utf32be,
app_in_utf8,
app_in_utf8mod,
app_in_utfcp1047,
};
static void
sv_cat_retcvref (SV *dst, SV *cv, SV *sv)
{
dSP;
int count;
ENTER;
SAVETMPS;
PUSHMARK(SP);
XPUSHs(sv_2mortal(sv));
PUTBACK;
count = call_sv(cv, G_EVAL|G_SCALAR);
SPAGAIN;
if (SvTRUE(ERRSV) || count != 1) {
croak("died in subroutine call from XS, " PkgName "\n");
}
sv_catsv(dst,POPs);
PUTBACK;
FREETMPS;
LEAVE;
}
MODULE = Unicode::Transform PACKAGE = Unicode::Transform
SV*
unicode_to_unicode (arg1, arg2=0)
SV* arg1
SV* arg2
PROTOTYPE: $;$
ALIAS:
unicode_to_utf16le = 1
unicode_to_utf16be = 2
unicode_to_utf32le = 3
unicode_to_utf32be = 4
unicode_to_utf8 = 5
unicode_to_utf8mod = 6
unicode_to_utfcp1047 = 7
utf16le_to_unicode = 8
utf16le_to_utf16le = 9
utf16le_to_utf16be = 10
utf16le_to_utf32le = 11
utf16le_to_utf32be = 12
utf16le_to_utf8 = 13
utf16le_to_utf8mod = 14
utf16le_to_utfcp1047 = 15
utf16be_to_unicode = 16
utf16be_to_utf16le = 17
utf16be_to_utf16be = 18
utf16be_to_utf32le = 19
utf16be_to_utf32be = 20
utf16be_to_utf8 = 21
utf16be_to_utf8mod = 22
utf16be_to_utfcp1047 = 23
utf32le_to_unicode = 24
utf32le_to_utf16le = 25
utf32le_to_utf16be = 26
utf32le_to_utf32le = 27
utf32le_to_utf32be = 28
utf32le_to_utf8 = 29
utf32le_to_utf8mod = 30
utf32le_to_utfcp1047 = 31
utf32be_to_unicode = 32
utf32be_to_utf16le = 33
utf32be_to_utf16be = 34
utf32be_to_utf32le = 35
utf32be_to_utf32be = 36
utf32be_to_utf8 = 37
utf32be_to_utf8mod = 38
utf32be_to_utfcp1047 = 39
utf8_to_unicode = 40
utf8_to_utf16le = 41
utf8_to_utf16be = 42
utf8_to_utf32le = 43
utf8_to_utf32be = 44
utf8_to_utf8 = 45
utf8_to_utf8mod = 46
utf8_to_utfcp1047 = 47
utf8mod_to_unicode = 48
utf8mod_to_utf16le = 49
utf8mod_to_utf16be = 50
utf8mod_to_utf32le = 51
utf8mod_to_utf32be = 52
utf8mod_to_utf8 = 53
utf8mod_to_utf8mod = 54
utf8mod_to_utfcp1047 = 55
utfcp1047_to_unicode = 56
utfcp1047_to_utf16le = 57
utfcp1047_to_utf16be = 58
utfcp1047_to_utf32le = 59
utfcp1047_to_utf32be = 60
utfcp1047_to_utf8 = 61
utfcp1047_to_utf8mod = 62
utfcp1047_to_utfcp1047 = 63
PREINIT:
SV *src, *dst, *cvref;
STRLEN srclen, dstlen, retlen, ulen;
U8 *s, *e, *p, *d, ubuf[UTF8_MAXLEN + 1];
UV uv;
UV (*ord_uv)(U8 *, STRLEN, STRLEN *);
U8* (*app_uv)(U8*, UV);
int from_utf_num, to_utf_num;
bool from_unicode, to_unicode;
CODE:
cvref = NULL;
if (SvROK(arg1)) {
if (SvTYPE(SvRV(arg1)) == SVt_PVCV)
cvref = SvRV(arg1);
else
croak(PkgName " CALLBACK is not a CODEREF");
}
src = cvref
? items == 1 ? &PL_sv_undef : arg2
: arg1;
from_utf_num = ix / Num_UTFs_here;
to_utf_num = ix % Num_UTFs_here;
from_unicode = from_utf_num == 0;
to_unicode = to_utf_num == 0;
if (!from_unicode && SvUTF8(src)) {
src = sv_mortalcopy(src);
sv_utf8_downgrade(src, 0);
}
else if (from_unicode && !SvUTF8(src)) {
src = sv_mortalcopy(src);
sv_utf8_upgrade(src);
}
s = (U8*)SvPV(src,srclen);
e = s + srclen;
dstlen = srclen * MaxLenAmplUni[ix] + 1;
dst = newSV(dstlen);
(void)SvPOK_only(dst);
if (to_unicode) {
SvUTF8_on(dst);
}
ord_uv = ord_uv_in[from_utf_num];
app_uv = app_uv_in[to_utf_num];
if (cvref) {
for (p = s; p < e;) {
uv = ord_uv(p, e - p, &retlen);
if (retlen)
p += retlen;
else
uv = (UV)*p++;
if (retlen && Is_VALID_UTF(uv)) {
ulen = app_uv(ubuf, uv) - ubuf;
sv_catpvn(dst, (char*)ubuf, ulen);
}
else
sv_cat_retcvref(dst, cvref, newSVuv(uv));
}
}
else {
d = (U8*)SvPVX(dst);
for (p = s; p < e;) {
uv = ord_uv(p, e - p, &retlen);
if (retlen)
p += retlen;
else {
p++;
continue;
}
if (Is_VALID_UTF(uv))
d = app_uv(d, uv);
}
*d = '\0';
SvCUR_set(dst, d - (U8*)SvPVX(dst));
}
RETVAL = dst;
OUTPUT:
RETVAL
SV*
chr_unicode (uv)
UV uv
PROTOTYPE: $
ALIAS:
chr_utf16le = 1
chr_utf16be = 2
chr_utf32le = 3
chr_utf32be = 4
chr_utf8 = 5
chr_utf8mod = 6
chr_utfcp1047 = 7
PREINIT:
SV *dst;
U8 *u, ubuf[UTF8_MAXLEN + 1];
U8* (*app_uv)(U8*, UV);
CODE:
dst = newSVpvn("", 0);
(void)SvPOK_only(dst);
if (ix == 0) {
SvUTF8_on(dst);
}
app_uv = app_uv_in[ix];
u = app_uv(ubuf, uv);
if (u == ubuf)
XSRETURN_UNDEF;
sv_catpvn(dst, (char*)ubuf, u - ubuf);
RETVAL = dst;
OUTPUT:
RETVAL
SV*
ord_unicode (src)
SV* src
PROTOTYPE: $
ALIAS:
ord_utf16le = 1
ord_utf16be = 2
ord_utf32le = 3
ord_utf32be = 4
ord_utf8 = 5
ord_utf8mod = 6
ord_utfcp1047 = 7
PREINIT:
STRLEN srclen, retlen;
U8 *s;
UV uv;
UV (*ord_uv)(U8 *, STRLEN, STRLEN *);
CODE:
if (ix != 0 && SvUTF8(src)) {
src = sv_mortalcopy(src);
sv_utf8_downgrade(src, 0);
}
else if (ix == 0 && !SvUTF8(src)) {
src = sv_mortalcopy(src);
sv_utf8_upgrade(src);
}
s = (U8*)SvPV(src,srclen);
if (!srclen)
XSRETURN_UNDEF;
ord_uv = ord_uv_in[ix];
uv = ord_uv(s, srclen, &retlen);
RETVAL = retlen ? newSVuv(uv) : &PL_sv_undef;
OUTPUT:
RETVAL