Sponsoring The Perl Toolchain Summit 2025: Help make this important event another success Learn more

#
# Copyright (c) 2013-2020 Christian Jaeger, copying@christianjaeger.ch
#
# This is free software, offered under either the same terms as perl 5
# or the terms of the Artistic License version 2 or the terms of the
# MIT License (Expat version). See the file COPYING.md that came
# bundled with this file.
#
=head1 NAME
PXML::Serialize
=head1 SYNOPSIS
=head1 DESCRIPTION
=head1 SPECIAL VALUES
There are some special values that the serializer will evaluate
transparently:
=over 4
=item promises from FP::Lazy
are C<force>d
=item code references
are called with no arguments
=item objects
In body context, their `pxml_serialized_body_string` method (if
available) will be called, in attribute context,
`pxml_serialized_attribute_string`, in both cases the string is
inserted into the output without escaping (see `PXML::Preserialize`
for an example that uses this). Missing those, `string` will be called
if available and the result escaped, otherwise an exception is thrown.
=back
=head1 NOTE
This is alpha software! Read the status section in the package README
or on the L<website|http://functional-perl.org/>.
=cut
use strict;
use warnings FATAL => 'uninitialized';
use Exporter "import";
our @EXPORT = qw(pxml_xhtml_print);
our @EXPORT_OK = qw(pxml_print
pxml_print_fragment
pxml_xhtml_print_fast
pxml_print_fragment_fast
putxmlfile
puthtmlfile
attribute_escape
content_escape
);
our %EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]);
use PXML qw(is_pxml_element is_pxmlflush);
use Chj::xperlfunc qw(xprint xprintln);
use FP::Weak 'weaken'; # instead of from Scalar::Util so that it can
# be turned off globally (and we depend on FP
# anyway)
use Scalar::Util qw(blessed refaddr);
sub is_somearray {
@_ == 1 or fp_croak_arity 1;
my $r = ref($_[0]);
# XX mess, make this a proper dependency
$r eq "ARRAY" or $r eq "PXML::Body"
}
sub is_empty_string {
@_ == 1 or fp_croak_arity 1;
defined $_[0] and !length ref $_[0] and $_[0] eq ""
}
my %attribute_escape
= ('&' => '&amp;', '<' => '&lt;', '>' => '&gt;', '"' => '&quot;');
sub attribute_escape {
my ($str) = @_;
return "" unless defined $str;
# XX or should attributes with undefined value be dropped? (Or,
# OTOH, should list processing be done?)
$str =~ s/([&<>"])/$attribute_escape{$1}/sg;
$str
}
my %content_escape = ('&' => '&amp;', '<' => '&lt;', '>' => '&gt;');
sub content_escape {
my ($str) = @_;
$str =~ s/([&<>])/$content_escape{$1}/sg;
$str
}
sub pxmlforce;
sub pxmlforce {
@_ == 1 or fp_croak_arity 1;
my ($v) = @_;
if (my $r = ref $v) {
if ($r eq "CODE") {
pxmlforce(&$r())
} else {
force $v
}
} else {
$v
}
}
sub object_force_escape {
@_ == 4 or fp_croak_arity 4;
my ($v, $string_method_for_context, $escape, $fh) = @_;
# $v is certain to be a reference (XX ehr, not even that?!), but
# not necessarily blessed.
if (defined blessed $v) {
if (my $m = $v->can($string_method_for_context)) {
# no escaping
return &$m($v, $fh);
} elsif (
$m =
# XX should this instead simply stringify using
# '"$v"'? That would not show up errors with
# context. But it would be less interruptive
# perhaps? Just issue a warning? Ideal would
# probably be to do the '""', but give a warning
# if it was Perl's default stringification. How to
# do this?
$v->can("string")
)
{
return &$escape(&$m($v));
}
}
die "unexpected type of reference that doesn't have a 'string' method: "
. (show $v);
}
# XXX hack, the code is really hopeless, should use ~same code as for
# body parts.
sub _attribute_val_to_string {
my ($v, $fh) = @_;
my $ref = ref $v;
if (length($ref)) {
if ($ref eq "ARRAY") {
join("", map { _attribute_val_to_string($_, $fh) } @$v)
} elsif (is_pxmlflush($v)) {
flush $fh or die $!;
""
} else {
object_force_escape(pxmlforce($v),
"pxml_serialized_attribute_string",
\&attribute_escape, $fh)
}
} else {
# fast path:
attribute_escape $v
}
}
sub _pxml_print_fragment_fast {
@_ == 4 or fp_croak_arity 4;
my ($v, $fh, $html5compat, $void_element_h) = @_;
weaken $_[0]
# necessary since we're also called with strings:
if ref $_[0];
LP: {
## **NOTE**: this has seen some evil optimizations; before
## working on the code, please undo them first by using git
## revert.
if (my $ref = ref $v) {
if (defined(my $class = blessed $v)) {
if (
$ref eq "PXML::Element"
or $ref eq "PXML::_::XHTML"
# ^ is this a worthwhile optimization?
or $v->isa("PXML::Element")
)
{
PXML:
my $n = $v->name;
print $fh "<$n" or die $!;
if (my $attrs = $v->maybe_attributes) {
for my $k (sort keys %$attrs) {
print $fh " $k=\"" or die $!;
my $str = _attribute_val_to_string $$attrs{$k}, $fh;
print $fh "$str\"" or die $!;
}
}
my $body = $v->body;
my $looksempty =
# fast path
(
not defined $body # XX allow undef or don't? Please
# finally settle this!
or (not ref $body and length($body) == 0)
or (
is_somearray($body) and (
not @$body or (
@$body == 1
and
( # XX remove undef check here now, too? OK?--nope, necessary
not defined $$body[0]
or (is_somearray($$body[0])
and not @{ $$body[0] })
or is_empty_string($$body[0])
)
)
)
)
);
my $selfreferential;
if ($html5compat) {
if ($$void_element_h{$n}) {
if ($looksempty) {
$selfreferential = 1;
} else {
my $isempty = # slow path
is_null(stream_mixed_flatten($body));
$selfreferential = $isempty;
warn "html5 compatible serialization requested "
. "but got void element '$n' that is not empty"
if not $isempty;
}
} else {
$selfreferential = 0;
}
} else {
$selfreferential = $looksempty;
}
if ($selfreferential) {
print $fh "/>" or die $!;
} else {
print $fh ">" or die $!;
no warnings "recursion"; # hu.
_pxml_print_fragment_fast($body, $fh, $html5compat,
$void_element_h);
print $fh "</$n>" or die $!;
}
} elsif (my $car_and_cdr = $v->can("car_and_cdr")) {
PAIR:
#my $a;
($a, $v) = &$car_and_cdr($v);
_pxml_print_fragment_fast($a, $fh, $html5compat,
$void_element_h);
#_pxml_print_fragment_fast (cdr $v, $fh);
redo LP;
} elsif (my $for_each = $v->can("for_each")) {
# catches null, too. Well.
&$for_each(
$v,
sub {
my ($a) = @_;
_pxml_print_fragment_fast($a, $fh, $html5compat,
$void_element_h);
}
);
} else {
my $v2 = force($v, 1);
# ^XX why pass nocache flag? (Was this to avoid
# memory retention issues?)
my $addr2 = refaddr($v2);
if (defined($addr2) and $addr2 != refaddr($v)) {
$v = $v2;
redo LP;
} elsif (is_somearray($v)) {
# COPY-PASTE. Really should refactor
# _pxml_print_fragment_fast into local hash-table
# based dispatcher.
no warnings "recursion"; # hu.
for (@$v) {
# XXX use Keep around $_ to prevent mutation of tree?
# nope, can't, will prevent streaming.
_pxml_print_fragment_fast($_, $fh, $html5compat,
$void_element_h);
}
} elsif (is_pxmlflush $v) {
flush $fh or die $!
} else {
# Fallback for references, XX copy-paste
print $fh object_force_escape($v,
"pxml_serialized_body_string", \&content_escape,
$fh)
or die $!;
}
}
} else {
if (is_somearray($v)) {
no warnings "recursion"; # hu.
for (@$v) {
# XXX use Keep around $_ to prevent mutation of tree?
# nope, can't, will prevent streaming.
_pxml_print_fragment_fast($_, $fh, $html5compat,
$void_element_h);
}
}
# 'force' doesn't evaluate CODE (probably rightly so),
# thus need to be explicit if we want 'late binding'
# (e.g. reference to dynamic variables) during
# serialization
elsif ($ref eq "CODE") {
$v = &$v();
redo LP;
} elsif (is_null $v) {
die "OBSOLETE?";
# end of linked list, nothing
# XX obsolete now, since $v->can("for_each") above
# will catch it already.
} else {
warn "XXX when does this happen?";
#use FP::Repl;
#repl;
# slow fallback... again, see above **NOTE** re
# evil.
$ref or die "BUG"; # we're in the if ref scope, right?
goto PXML if $v->isa("PXML::Element");
goto PAIR if is_pair $v;
# goto PROMISE if is_promise $v;
# Fallback for references, XX copy-paste
print $fh object_force_escape($v,
"pxml_serialized_body_string", \&content_escape, $fh)
or die $!;
}
}
} elsif (not defined $v) {
# (previously end of linked list marker) nothing; XX
# should this give exception (to point out any issue with
# deleted streams, the reason why I changed from using
# undef to null)? But exception won't show a good
# backtrace anyway at this point.
#warn "warning: ignoring undef in PXML datastructure";
# XXX what to do about this?
} else {
#print $fh content_escape($v) or die $!;
$v =~ s/([&<>])/$content_escape{$1}/sg;
print $fh $v or die $!;
}
}
}
sub pxml_print_fragment_fast {
@_ == 2 or fp_croak_arity 2;
my ($v, $fh) = @_;
weaken $_[0] if ref $_[0]; # ref check perhaps unnecessary here
my $no_element = sub {
@_ = ($v, $fh, undef, undef);
goto \&_pxml_print_fragment_fast;
};
my $with_first_element = sub {
my ($firstel) = @_;
weaken $_[0] if ref $_[0];
my $html5compat
= $firstel->require_printing_nonvoid_elements_nonselfreferential;
@_ = ($v, $fh, $html5compat,
($html5compat and $firstel->void_element_h));
goto \&_pxml_print_fragment_fast;
};
if (length(my $r = ref $v)) {
if (defined blessed $v and $v->isa("PXML::XHTML")) {
@_ = ($v);
goto &$with_first_element;
} else {
my $s = force(stream_mixed_flatten($v)->filter(\&is_pxml_element));
if (is_null $s) {
goto &$no_element
} else {
@_ = (car $s);
goto &$with_first_element;
}
}
} else {
goto &$no_element
}
}
sub pxml_xhtml_print_fast {
@_ >= 2 and @_ <= 3 or fp_croak_arity "2-3";
my ($v, $fh, $maybe_lang) = @_;
weaken $_[0] if ref $_[0]; # ref check perhaps unnecessary here
if (not ref $v or (defined(blessed $v) and not $v->isa("PXML::Element"))) {
die "not an element: " . (show $v);
}
if (not "html" eq $v->name) {
die "not an 'html' element: " . (show $v);
}
xprint($fh, "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n");
xprint($fh,
"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
);
# add attributes to toplevel element
my $v2 = $v->maybe_attributes ? $v : $v->attributes_set(
do {
my $lang = $maybe_lang
or die
"missing 'lang' attribute from html element and no lang option given";
+{
"xml:lang" => $lang,
lang => $lang
}
}
);
@_ = ($v2, $fh);
goto \&pxml_print_fragment_fast;
}
# for now,
sub pxml_xhtml_print;
*pxml_xhtml_print = \&pxml_xhtml_print_fast;
use Chj::xopen "xopen_write";
sub pxml_print {
@_ == 2 or fp_croak_arity 2;
my ($v, $fh) = @_;
weaken $_[0] if ref $_[0]; # ref check perhaps unnecessary here
xprintln($fh, q{<?xml version="1.0"?>});
pxml_print_fragment_fast($v, $fh);
}
sub putxmlfile {
@_ == 2 or fp_croak_arity 2;
my ($path, $xml) = @_;
weaken $_[1] if ref $_[0]; # ref check perhaps unnecessary here
my $f = xopen_write $path;
binmode($f, ":utf8") or die "binmode";
# ^ XX should this use ":encoding(UTF-8)"? To validate in-memory
# strings? Shouldn't we just check all *inputs*?
pxml_print($xml, $f);
$f->xclose;
}
sub PXML::Element::xmlfile {
my ($v, $path) = @_;
weaken $_[0];
putxmlfile($path, $v)
}
sub puthtmlfile {
@_ >= 2 and @_ <= 3 or fp_croak_arity "2-3";
my ($path, $v, $maybe_lang) = @_;
weaken $_[1] if ref $_[0]; # ref check perhaps unnecessary here
#xmkdir_p dirname $path;
my $out = xopen_write($path);
binmode $out, ":utf8" or die "binmode";
# ^ XX dito, see comment in putxmlfile
pxml_xhtml_print_fast($v, $out, $maybe_lang || "en");
$out->xclose;
}
sub PXML::Element::htmlfile {
my ($v, $path, $maybe_lang) = @_;
weaken $_[0];
puthtmlfile($path, $v, $maybe_lang)
}
1