our
@EXPORT_OK
=
qw'encode_entities decode_entities'
;
my
%entity2char
= (
'&'
=>
'&'
,
'>'
=>
'>'
,
'<'
=>
'<'
,
'"'
=>
'"'
,
'''
=>
"'"
,
'Æ'
=>
chr
(198), # capital AE diphthong (ligature)
'Á'
=>
chr
(193), # capital A, acute accent
'Â'
=>
chr
(194), # capital A, circumflex accent
'À'
=>
chr
(192), # capital A, grave accent
'Å'
=>
chr
(197), # capital A, ring
'Ã'
=>
chr
(195), # capital A, tilde
'Ä'
=>
chr
(196), # capital A, dieresis or umlaut mark
'Ç'
=>
chr
(199), # capital C, cedilla
'Ð'
=>
chr
(208), # capital Eth, Icelandic
'É'
=>
chr
(201), # capital E, acute accent
'Ê'
=>
chr
(202), # capital E, circumflex accent
'È'
=>
chr
(200), # capital E, grave accent
'Ë'
=>
chr
(203), # capital E, dieresis or umlaut mark
'Í'
=>
chr
(205), # capital I, acute accent
'Î'
=>
chr
(206), # capital I, circumflex accent
'Ì'
=>
chr
(204), # capital I, grave accent
'Ï'
=>
chr
(207), # capital I, dieresis or umlaut mark
'Ñ'
=>
chr
(209), # capital N, tilde
'Ó'
=>
chr
(211), # capital O, acute accent
'Ô'
=>
chr
(212), # capital O, circumflex accent
'Ò'
=>
chr
(210), # capital O, grave accent
'Ø'
=>
chr
(216), # capital O, slash
'Õ'
=>
chr
(213), # capital O, tilde
'Ö'
=>
chr
(214), # capital O, dieresis or umlaut mark
'Þ'
=>
chr
(222), # capital THORN, Icelandic
'Ú'
=>
chr
(218), # capital U, acute accent
'Û'
=>
chr
(219), # capital U, circumflex accent
'Ù'
=>
chr
(217), # capital U, grave accent
'Ü'
=>
chr
(220), # capital U, dieresis or umlaut mark
'Ý'
=>
chr
(221), # capital Y, acute accent
'á'
=>
chr
(225), # small a, acute accent
'â'
=>
chr
(226), # small a, circumflex accent
'æ'
=>
chr
(230), # small ae diphthong (ligature)
'à'
=>
chr
(224), # small a, grave accent
'å'
=>
chr
(229), # small a, ring
'ã'
=>
chr
(227), # small a, tilde
'ä'
=>
chr
(228), # small a, dieresis or umlaut mark
'ç'
=>
chr
(231), # small c, cedilla
'é'
=>
chr
(233), # small e, acute accent
'ê'
=>
chr
(234), # small e, circumflex accent
'è'
=>
chr
(232), # small e, grave accent
'ð'
=>
chr
(240), # small eth, Icelandic
'ë'
=>
chr
(235), # small e, dieresis or umlaut mark
'í'
=>
chr
(237), # small i, acute accent
'î'
=>
chr
(238), # small i, circumflex accent
'ì'
=>
chr
(236), # small i, grave accent
'ï'
=>
chr
(239), # small i, dieresis or umlaut mark
'ñ'
=>
chr
(241), # small n, tilde
'ó'
=>
chr
(243), # small o, acute accent
'ô'
=>
chr
(244), # small o, circumflex accent
'ò'
=>
chr
(242), # small o, grave accent
'ø'
=>
chr
(248), # small o, slash
'õ'
=>
chr
(245), # small o, tilde
'ö'
=>
chr
(246), # small o, dieresis or umlaut mark
'ß'
=>
chr
(223), # small sharp s, German (sz ligature)
'þ'
=>
chr
(254), # small thorn, Icelandic
'ú'
=>
chr
(250), # small u, acute accent
'û'
=>
chr
(251), # small u, circumflex accent
'ù'
=>
chr
(249), # small u, grave accent
'ü'
=>
chr
(252), # small u, dieresis or umlaut mark
'ý'
=>
chr
(253), # small y, acute accent
'ÿ'
=>
chr
(255), # small y, dieresis or umlaut mark
'©'
=>
chr
(169), # copyright sign
'®'
=>
chr
(174), # registered sign
' '
=>
chr
(160), # non breaking space
'¡'
=>
chr
(161),
'¢'
=>
chr
(162),
'£'
=>
chr
(163),
'¤'
=>
chr
(164),
'¥'
=>
chr
(165),
'¦'
=>
chr
(166),
'§'
=>
chr
(167),
'¨'
=>
chr
(168),
'ª'
=>
chr
(170),
'«'
=>
chr
(171),
'¬'
=>
chr
(172),
'­'
=>
chr
(173),
'¯'
=>
chr
(175),
'°'
=>
chr
(176),
'±'
=>
chr
(177),
'¹'
=>
chr
(185),
'²'
=>
chr
(178),
'³'
=>
chr
(179),
'´'
=>
chr
(180),
'µ'
=>
chr
(181),
'¶'
=>
chr
(182),
'·'
=>
chr
(183),
'¸'
=>
chr
(184),
'º'
=>
chr
(186),
'»'
=>
chr
(187),
'¼'
=>
chr
(188),
'½'
=>
chr
(189),
'¾'
=>
chr
(190),
'¿'
=>
chr
(191),
'×'
=>
chr
(215),
'÷'
=>
chr
(247),
'Œ'
=>
chr
(338),
'œ'
=>
chr
(339),
'Š'
=>
chr
(352),
'š'
=>
chr
(353),
'Ÿ'
=>
chr
(376),
'ƒ'
=>
chr
(402),
'ˆ'
=>
chr
(710),
'˜'
=>
chr
(732),
'Α'
=>
chr
(913),
'Β'
=>
chr
(914),
'Γ'
=>
chr
(915),
'Δ'
=>
chr
(916),
'Ε'
=>
chr
(917),
'Ζ'
=>
chr
(918),
'Η'
=>
chr
(919),
'Θ'
=>
chr
(920),
'Ι'
=>
chr
(921),
'Κ'
=>
chr
(922),
'Λ'
=>
chr
(923),
'Μ'
=>
chr
(924),
'Ν'
=>
chr
(925),
'Ξ'
=>
chr
(926),
'Ο'
=>
chr
(927),
'Π'
=>
chr
(928),
'Ρ'
=>
chr
(929),
'Σ'
=>
chr
(931),
'Τ'
=>
chr
(932),
'Υ'
=>
chr
(933),
'Φ'
=>
chr
(934),
'Χ'
=>
chr
(935),
'Ψ'
=>
chr
(936),
'Ω'
=>
chr
(937),
'α'
=>
chr
(945),
'β'
=>
chr
(946),
'γ'
=>
chr
(947),
'δ'
=>
chr
(948),
'ε'
=>
chr
(949),
'ζ'
=>
chr
(950),
'η'
=>
chr
(951),
'θ'
=>
chr
(952),
'ι'
=>
chr
(953),
'κ'
=>
chr
(954),
'λ'
=>
chr
(955),
'μ'
=>
chr
(956),
'ν'
=>
chr
(957),
'ξ'
=>
chr
(958),
'ο'
=>
chr
(959),
'π'
=>
chr
(960),
'ρ'
=>
chr
(961),
'ς'
=>
chr
(962),
'σ'
=>
chr
(963),
'τ'
=>
chr
(964),
'υ'
=>
chr
(965),
'φ'
=>
chr
(966),
'χ'
=>
chr
(967),
'ψ'
=>
chr
(968),
'ω'
=>
chr
(969),
'ϑ'
=>
chr
(977),
'ϒ'
=>
chr
(978),
'ϖ'
=>
chr
(982),
' '
=>
chr
(8194),
' '
=>
chr
(8195),
' '
=>
chr
(8201),
'‌'
=>
chr
(8204),
'‍'
=>
chr
(8205),
'‎'
=>
chr
(8206),
'‏'
=>
chr
(8207),
'–'
=>
chr
(8211),
'—'
=>
chr
(8212),
'‘'
=>
chr
(8216),
'’'
=>
chr
(8217),
'‚'
=>
chr
(8218),
'“'
=>
chr
(8220),
'”'
=>
chr
(8221),
'„'
=>
chr
(8222),
'†'
=>
chr
(8224),
'‡'
=>
chr
(8225),
'•'
=>
chr
(8226),
'…'
=>
chr
(8230),
'‰'
=>
chr
(8240),
'′'
=>
chr
(8242),
'″'
=>
chr
(8243),
'‹'
=>
chr
(8249),
'›'
=>
chr
(8250),
'‾'
=>
chr
(8254),
'⁄'
=>
chr
(8260),
'€'
=>
chr
(8364),
'ℑ'
=>
chr
(8465),
'℘'
=>
chr
(8472),
'ℜ'
=>
chr
(8476),
'™'
=>
chr
(8482),
'ℵ'
=>
chr
(8501),
'←'
=>
chr
(8592),
'↑'
=>
chr
(8593),
'→'
=>
chr
(8594),
'↓'
=>
chr
(8595),
'↔'
=>
chr
(8596),
'↵'
=>
chr
(8629),
'⇐'
=>
chr
(8656),
'⇑'
=>
chr
(8657),
'⇒'
=>
chr
(8658),
'⇓'
=>
chr
(8659),
'⇔'
=>
chr
(8660),
'∀'
=>
chr
(8704),
'∂'
=>
chr
(8706),
'∃'
=>
chr
(8707),
'∅'
=>
chr
(8709),
'∇'
=>
chr
(8711),
'∈'
=>
chr
(8712),
'∉'
=>
chr
(8713),
'∋'
=>
chr
(8715),
'∏'
=>
chr
(8719),
'∑'
=>
chr
(8721),
'−'
=>
chr
(8722),
'∗'
=>
chr
(8727),
'√'
=>
chr
(8730),
'∝'
=>
chr
(8733),
'∞'
=>
chr
(8734),
'∠'
=>
chr
(8736),
'∧'
=>
chr
(8743),
'∨'
=>
chr
(8744),
'∩'
=>
chr
(8745),
'∪'
=>
chr
(8746),
'∫'
=>
chr
(8747),
'∴'
=>
chr
(8756),
'∼'
=>
chr
(8764),
'≅'
=>
chr
(8773),
'≈'
=>
chr
(8776),
'≠'
=>
chr
(8800),
'≡'
=>
chr
(8801),
'≤'
=>
chr
(8804),
'≥'
=>
chr
(8805),
'⊂'
=>
chr
(8834),
'⊃'
=>
chr
(8835),
'⊄'
=>
chr
(8836),
'⊆'
=>
chr
(8838),
'⊇'
=>
chr
(8839),
'⊕'
=>
chr
(8853),
'⊗'
=>
chr
(8855),
'⊥'
=>
chr
(8869),
'⋅'
=>
chr
(8901),
'⌈'
=>
chr
(8968),
'⌉'
=>
chr
(8969),
'⌊'
=>
chr
(8970),
'⌋'
=>
chr
(8971),
'〈'
=>
chr
(9001),
'〉'
=>
chr
(9002),
'◊'
=>
chr
(9674),
'♠'
=>
chr
(9824),
'♣'
=>
chr
(9827),
'♥'
=>
chr
(9829),
'♦'
=>
chr
(9830),
);
my
%char2entity
=
map
{
$entity2char
{
$_
} =>
$_
}
keys
%entity2char
;
sub
encode_entities {
my
(
$string
,
$chars
) =
@_
;
my
%escape
;
if
(
$chars
) {
%escape
=
map
{
$_
=> 1 }
split
//,
$chars
;
}
else
{
%escape
=
map
{
$_
=> 1 }
keys
%char2entity
;
}
my
@string
=
split
//,
$string
;
for
my
$i
( 0 ..
$#string
) {
my
$c
=
$string
[
$i
];
if
(
$escape
{
$c
} and
$c
ne
'&'
and
$c
ne
';'
) {
$string
[
$i
] =
$char2entity
{
$c
};
}
elsif
(
$escape
{
$c
} and
$c
eq
'&'
) {
my
$maybe_entity
=
''
;
FIND_SEMI:
for
my
$j
(
$i
..
$#string
) {
$maybe_entity
.=
$string
[
$j
];
last
FIND_SEMI
if
$string
[
$j
] eq
';'
;
}
if
( not
exists
$entity2char
{
$maybe_entity
} ) {
$string
[
$i
] =
$char2entity
{
$c
};
}
}
elsif
(
$escape
{
$c
} and
$c
eq
';'
) {
my
$maybe_entity
=
''
;
FIND_AMP:
for
(
my
$j
=
$i
;
$j
>= 0;
$j
-- ) {
$maybe_entity
=
$string
[
$j
] .
$maybe_entity
;
last
FIND_SEMI
if
$string
[
$j
] eq
'&'
;
}
if
( not
exists
$entity2char
{
$maybe_entity
} ) {
$string
[
$i
] =
$char2entity
{
$c
};
}
}
}
return
join
''
,
@string
;
}
sub
decode_entities {
my
@results
;
for
my
$string
(
@_
) {
my
@string
=
split
//,
$string
;
for
my
$i
( 0 ..
$#string
) {
my
$c
=
$string
[
$i
];
if
(
$c
eq
'&'
) {
my
$maybe_entity
=
''
;
my
$length
= 0;
FIND_SEMI:
for
my
$j
(
$i
..
$#string
) {
$maybe_entity
.=
$string
[
$j
];
last
FIND_SEMI
if
$string
[
$j
] eq
';'
;
$length
++;
}
if
(
exists
$entity2char
{
$maybe_entity
} ) {
$string
[
$i
] =
$entity2char
{
$maybe_entity
};
splice
(
@string
,
$i
+ 1,
$length
);
}
}
}
push
@results
,
join
''
,
@string
;
}
return
wantarray
?
@results
:
$results
[0];
}
1;