/* $Id$ * * Copyright 1997-1999, Gisle Aas. * * This library is free software; you can redistribute it and/or * modify it under the same terms as Perl itself. */ #ifdef __cplusplus extern "C" { #endif #include "EXTERN.h" #include "perl.h" #include "XSUB.h" #ifdef __cplusplus } #endif #include "patchlevel.h" #if PATCHLEVEL <= 4 && !defined(PL_dowarn) #define PL_dowarn dowarn #endif #ifdef G_WARN_ON #define DOWARN (PL_dowarn & G_WARN_ON) #else #define DOWARN PL_dowarn #endif MODULE = Unicode::String PACKAGE = Unicode::String PROTOTYPES: DISABLE SV* latin1(self,...) SV* self PREINIT: SV* newsv; SV* str; CODE: RETVAL = 0; if (!sv_isobject(self)) { newsv = self; RETVAL = self = newSV(0); newSVrv(self, "Unicode::String"); } else if (items > 1) { newsv = ST(1); } else { newsv = 0; } str = SvRV(self); if (GIMME_V != G_VOID && !RETVAL) { U8 *beg, *s; STRLEN len; U16* usp = (U16*)SvPV(str,len); len /= 2; RETVAL = newSV(len+1); SvPOK_on(RETVAL); beg = s = (U8*)SvPVX(RETVAL); while (len--) { U16 us = ntohs(*usp++); if (us > 255) { if (us == 0xFEFF) { /* ignore BYTE ORDER MARK */ } else { if (DOWARN) warn("Data outside latin1 range (pos=%d, ch=U+%x)", s - beg, us); } } else { *s++ = us; } } SvCUR_set(RETVAL, s - beg); *s='\0'; } if (newsv) { U16 *usp; STRLEN len; STRLEN my_na; U8 *s = (U8*)SvPV(newsv, len); SvGROW(str, len*2 + 2); SvPOK_on(str); SvCUR_set(str,len*2); usp = (U16*)SvPV(str,my_na); while (len--) { *usp++ = htons((U16)*s++); } *usp = 0; } if (!RETVAL) RETVAL = newSViv(0); OUTPUT: RETVAL SV* ucs4(self,...) SV* self PREINIT: SV* newsv; SV* str; CODE: RETVAL = 0; if (!sv_isobject(self)) { newsv = self; RETVAL = self = newSV(0); newSVrv(self, "Unicode::String"); } else if (items > 1) { newsv = ST(1); } else { newsv = 0; } str = SvRV(self); if (GIMME_V != G_VOID && !RETVAL) { U32* to, *beg; STRLEN len; /* source length */ U16* from = (U16*)SvPV(str, len); STRLEN my_na; len /= 2; RETVAL = newSV(len*4 + 1); SvPOK_on(RETVAL); beg = to = (U32*)SvPV(RETVAL, my_na); while (len--) { U16 us = ntohs(*from++); if (us >= 0xD800 && us <= 0xDFFF) { /* surrogate */ U16 low = len ? ntohs(*from) : 0; if (us >= 0xDC00 || low < 0xDC00 || low > 0xDFFF) { /* bad surrogate pair */ if (DOWARN) warn("Bad surrogate pair U+%04x U+%04x", us, low); } else { len--; from++; *to++ = htonl((us-0xD800)*0x400 + low-0xDC00 + 0x10000); } } else { *to++ = htonl(us); } } SvCUR_set(RETVAL, (to - beg) * 4); SvPVX(RETVAL)[SvCUR(RETVAL)] = '\0'; } if (newsv) { STRLEN len; U32* from = (U32*)SvPV(newsv, len); len /= 4; SvGROW(str, len*2 + 1); /* enough if we don't need surrogates */ SvPOK_on(str); SvCUR_set(str, 0); while (len--) { U32 uc = ntohl(*from++); /* XXX should look for swapped FEFF */ if (uc > 0xFFFF) { if (uc > 0x10FFFF) { /* can't be represented */ if (DOWARN) warn("UCS4 char (0x%08x) can not be encoded as UTF16", uc); } else { /* generate two surrogates */ U16 high, low; uc -= 0x10000; high = htons(uc/0x400 + 0xD800); low = htons(uc%0x400 + 0xDC00); sv_catpvn(str, (char*)&high, 2); sv_catpvn(str, (char*)&low, 2); } } else { U16 s = htons(uc); sv_catpvn(str, (char*)&s, 2); } } /* ensure '\0' termination of string */ SvGROW(str, SvCUR(str)+1); SvPVX(str)[SvCUR(str)] = '\0'; } if (!RETVAL) RETVAL = newSViv(0); OUTPUT: RETVAL SV* utf8(self,...) SV* self PREINIT: SV* newsv; SV* str; CODE: RETVAL = 0; if (!sv_isobject(self)) { newsv = self; RETVAL = self = newSV(0); newSVrv(self, "Unicode::String"); } else if (items > 1) { newsv = ST(1); } else { newsv = 0; } str = SvRV(self); if (GIMME_V != G_VOID && !RETVAL) { /* encode str */ STRLEN len; U16* from = (U16*)SvPV(str, len); len /= 2; RETVAL = newSV(len*1.2 + 1); /* guess osuitable for euro-text */ SvPOK_on(RETVAL); SvCUR_set(RETVAL, 0); while (len--) { register U32 us = ntohs(*from++); if (us >= 0xD800 && us <= 0xDFFF) { /* surrogate */ U16 low = len ? ntohs(*from) : 0; if (us >= 0xDC00 || low < 0xDC00 || low > 0xDFFF) { /* bad surrogate pair */ if (DOWARN) warn("Bad surrogate pair U+%04x U+%04x", us, low); } else { len--; from++; us = (us-0xD800)*0x400 + low-0xDC00 + 0x10000; } } if (us < 0x80) { U8 c = us; sv_catpvn(RETVAL, (char*)&c, 1); } else if (us < 0x800) { U8 c[2]; c[1] = (us & 0077) | 0200; c[0] = (us >> 6) | 0300; sv_catpvn(RETVAL, (char*)c, 2); } else if (us < 0x10000) { U8 c[3]; c[2] = (us & 0077) | 0200; us >>= 6; c[1] = (us & 0077) | 0200; us >>= 6; c[0] = us | 0340; sv_catpvn(RETVAL, (char*)c, 3); } else if (us < 0x200000) { U8 c[4]; c[3] = (us & 0077) | 0200; us >>= 6; c[2] = (us & 0077) | 0200; us >>= 6; c[1] = (us & 0077) | 0200; us >>= 6; c[0] = us | 0360; sv_catpvn(RETVAL, (char*)c, 4); } else { /* this can't really happen since we start with utf16 */ if (DOWARN) warn("Large char (%08X) ignored", us); } } /* ensure '\0' termination of string */ SvGROW(str, SvCUR(str)+1); SvPVX(str)[SvCUR(str)] = '\0'; } if (newsv) { /* decode new */ STRLEN len; U8* from = (U8*)SvPV(newsv, len); SvGROW(str, len + 1); /* must be at least this big */ SvPOK_on(str); SvCUR_set(str, 0); while (len--) { U8 s[2]; U8 u = *from++; if (u < 0x80) { s[0] = '\0'; s[1] = u; sv_catpvn(str, (char*)s, 2); } else if ((u & 0340) == 0300) { /* 2 bytes to decode */ if (!len) { if (DOWARN) warn("Missing second byte of utf8 encoded char"); } else { U8 u2 = *from; if ((u2 & 0300) != 0200) { if (DOWARN) warn("Bad second byte of utf8 encoded char"); } else { from++; len--; /* consume it */ s[0] = (u & 0037) >> 2; s[1] = ((u & 0003) << 6) | (u2 & 0077); sv_catpvn(str, (char*)s, 2); } } } else if ((u & 0360) == 0340) { /* 3 bytes to decode */ if (len < 2) { if (DOWARN) warn("Missing 2nd or 3rd byte of utf8 encoded char"); } else { U8 u2 = from[0]; U8 u3 = from[1]; if ((u2 & 0300) != 0200 || (u3 & 0300) != 0200) { if (DOWARN) warn("Bad 2nd or 3rd byte of utf8 encoded char"); } else { from += 2; len -= 2; /* consume them */ s[0] = (u << 4) | (u2 & 0077) >> 2; s[1] = (u2 << 6) | (u3 & 0077); sv_catpvn(str, (char*)s, 2); } } } else if ((u & 0370) == 0360) { /* 4 bytes to decode, encoded using surrogates */ if (len < 3) { if (DOWARN) warn("Missing 2nd, 3rd or 4th byte of utf8 encoded char"); } else { if ((from[0] & 0300) != 0200 || (from[1] & 0300) != 0200 || (from[2] & 0300) != 0200) { if (DOWARN) warn("Bad 2nd, 3rd or 4th byte of utf8 encoded char"); } else { U32 c = (u & 0007) << 6; c |= (from[0] & 0077); c <<= 6; c |= (from[1] & 0077); c <<= 6; c |= (from[2] & 0077); from += 3; len -= 3; /* c must now be encoded as two surrogates */ if (c > 0x10FFFF) { if (DOWARN) warn("Can't represent 0x%08X as utf16", c); } else { /* generate two surrogates */ U16 high, low; c -= 0x10000; high = htons(c/0x400 + 0xD800); low = htons(c%0x400 + 0xDC00); sv_catpvn(str, (char*)&high, 2); sv_catpvn(str, (char*)&low, 2); } } } } else if ((u & 0374) == 0370) { /* 5 bytes to decode, can't happend */ if (DOWARN) warn("Can't represent 5 byte encoded chars"); } else { if (DOWARN) warn("Bad utf8 byte (0x%02X) ignored", u); } } } if (!RETVAL) RETVAL = newSViv(0); OUTPUT: RETVAL void byteswap2(...) ALIAS: Unicode::String::byteswap2 = 2 Unicode::String::byteswap4 = 4 PREINIT: int i; char c; STRLEN len; char* str; PPCODE: for (i = 0; i < items; i++) { SV* sv = ST(i); STRLEN len; char* src = SvPV(sv, len); char* dest; if (GIMME_V != G_VOID) { SV* dest_sv = sv_2mortal(newSV(len+1)); SvCUR_set(dest_sv, len); *SvEND(dest_sv) = 0; SvPOK_on(dest_sv); PUSHs(dest_sv); dest = SvPVX(dest_sv); } else { if (SvREADONLY(sv)) { die("byteswap argument #%d is readonly", i+1); continue; /* probably not */ } dest = src; } if (ix == 2) { while (len >= 2) { char tmp = *src++; *dest++ = *src++; *dest++ = tmp; len -= 2; } } else { /* ix == 4 */ while (len >= 4) { char tmp1 = *src++; char tmp2 = *src++; *dest++ = src[1]; *dest++ = src[0]; src += 2; *dest++ = tmp2; *dest++ = tmp1; len -= 4; } } if (len) { if (DOWARN) warn("byteswap argument #%d not long enough", i+1); /* this will be a no-op unless dest/src are different */ while (len--) *dest++ = *src++; } }