use strict;
use warnings;
package Net::IMP::HTTP::Example::SaveResponse;
use base 'Net::IMP::HTTP::Request';
use fields qw(root file);

use Net::IMP;
use Net::IMP::Debug;
use File::Path 'make_path';
use File::Temp 'tempfile';
use Digest::MD5;
use Carp;
use Scalar::Util 'looks_like_number';
no if $] >= 5.017011, warnings => 'experimental::smartmatch';

my $DEFAULT_LIMIT = 10_000_000;

sub RTYPES { (IMP_PREPASS) }

sub new_factory {
    my ($class,%args) = @_;
    my $dir = $args{root} or croak("no root directory given");
    -d $dir && -r _ && -x _ or croak("cannot use base dir $dir: $!");
    $args{limit} = $DEFAULT_LIMIT if ! defined $args{limit};
    return $class->SUPER::new_factory(%args);
}

sub new_analyzer {
    my ($factory,%args) = @_;
    my $self = $factory->SUPER::new_analyzer(%args);
    # we don't modify
    $self->run_callback(
	[ IMP_PREPASS,0,IMP_MAXOFFSET ],
	[ IMP_PREPASS,1,IMP_MAXOFFSET ]
    );
    return $self;
}


sub DESTROY {
    my $self = shift;
    $self->{file} && $self->{file}{tname} && unlink($self->{file}{tname});
}

sub request_hdr {
    my ($self,$hdr) = @_;

    my ($method,$proto,$host,$path) = $hdr =~m{\A([A-Z]+) +(?:(\w+)://([^/]+))?(\S+)};
    $host = $1 if $hdr =~m{\nHost: *(\S+)}i;
    $host or goto IGNORE;
    $proto ||= 'http';
    $host = lc($host);
    my $port = 
	$host=~s{^(?:\[(\w._\-:)+\]|(\w._\-))(?::(\d+))?$}{ $1 || $2 }e ? 
	$3:80;

    if ( my $rx = $self->{factory_args}{only_url} ) {
	goto IGNORE if "$proto://$host:$port$path" !~ $rx
	    and "$proto://$host$path" !~ $rx
    }
    if ( my $rx = $self->{factory_args}{exclude_url} ) {
	goto IGNORE if "$proto://$host:$port$path" =~ $rx
	    or "$proto://$host$path" =~ $rx
    }
    if ( my $srh = $self->{factory_args}{method} ) {
	goto IGNORE if ! _check_srh($srh,$method);
    }
    

    my $dir = $self->{factory_args}{root}."/$host:$port";
    if ( ! -d $dir ) {
	my $err;
	make_path($dir, { error => \$err  });
    }
    my ($fh,$fname) = tempfile( "tmpXXXXXXX", DIR => $dir )
	or goto IGNORE;

    $hdr =~s{^(Content-encoding:|Transfer-encoding:|Content-length:)}{X-Original-$1}mig;
    print $fh $hdr;

    my $qstring = $path =~s{\?(.+)}{} ? $1 : undef;
    $self->{file} = {
	tfh => $fh,
	tname => $fname,
	dir => $dir,
	method => $method,
	md5path => Digest::MD5->new->add($path)->hexdigest,
	md5data => undef,
	size => [ length($hdr),0,0,0 ],
	rphdr => '',
	rpbody => '',
	eof => 0,
    };
    ( $self->{file}{md5data} = Digest::MD5->new )->add("\000$qstring\001")
	if defined $qstring and ! $self->{factory_args}{ignore_parameters};
    return; # continue in request body

    IGNORE:
    # pass thru w/o saving
    debug("no save $host:$port/$path");
    $self->run_callback( 
	# pass thru everything 
	[ IMP_PASS,0,IMP_MAXOFFSET ], 
	[ IMP_PASS,1,IMP_MAXOFFSET ], 
    );
}

sub request_body {
    my ($self,$data) = @_;
    my $f = $self->{file} or return;
    print { $f->{tfh} } $data;
    my $md = $f->{md5data};
    if ( $data ne '' ) {
	$f->{size}[1] += length($data);
	if ( my $l = $self->{factory_args}{limit} ) {
	    return _stop_saving($self) if $f->{size}[1] > $l;
	}
	if ( ! $md ) {
	    return if $self->{factory_args}{ignore_parameters};
	    $md = $f->{md5data} = Digest::MD5->new;
	}
	$md->add($data);
	return;
    }
    if ( defined( my $rp = $f->{rphdr} )) {
	print { $f->{tfh} } $rp;
	$f->{rphdr} = undef;
	if ( defined( $rp = $f->{rpbody} )) {
	    print { $f->{tfh} } $rp;
	    $f->{rpbody} = undef;
	}
    }
    _check_eof($self,1);
}

sub response_hdr {
    my ($self,$hdr) = @_;
    my $f = $self->{file} or return;
    return _stop_saving($self) if $hdr =~m{\AHTTP/1\.[01] (100|304|5\d\d)};
    if ( my $srh = $self->{factory_args}{content_type} ) {
	my ($ct) = $hdr =~m{^Content-type:\s*([^\s;]+)}mi;
	$ct ||= 'application/octet-stream';
	return _stop_saving($self) if ! _check_srh( $srh, lc($ct));
    }
    $hdr =~s{^(Content-encoding:|Transfer-encoding:|Content-length:)}{X-Original-$1}mig;
    $f->{size}[2] = length($hdr);
    if ( defined $f->{rphdr} ) {
	# defer, request body not fully read
	$f->{rphdr} = $hdr;
    } else {
	print {$f->{tfh}} $hdr;
    }
}

sub response_body {
    my ($self,$data) = @_;
    my $f = $self->{file} or return;
    $f->{size}[3] += length($data);
    if ( my $l = $self->{factory_args}{limit} ) {
	return _stop_saving($self) if $f->{size}[3] > $l;
    }
    if ( $data eq '' ) {
	_check_eof($self,2)
    } elsif ( defined $f->{rpbody} ) {
	$f->{rpbody} .= $data;
    } else {
	print {$f->{tfh}} $data;
    }
}

sub _check_eof {
    my ($self,$bit) = @_;
    my $f = $self->{file} or return;
    ( $f->{eof} |= $bit ) == 3 or return;
    $self->{file} = undef;
    print {$f->{tfh}} pack("NNNN",@{ $f->{size} });
    close($f->{tfh});
    my $fname = "$f->{dir}/".join( "-",
	lc($f->{method}),
	$f->{md5path},
	$f->{md5data} ? ($f->{md5data}->hexdigest):()
    );
    rename($f->{tname}, $fname);
}

# will not be tracked
sub any_data {
    my $self = shift;
    my $f = $self->{file} or return;
    unlink($f->{tname});
    $self->{file} = undef;
}

### config stuff ######
sub validate_cfg {
    my ($class,%cfg) = @_;
    my $dir = delete $cfg{root};
    my @err;
    push @err, "no or non-existing root dir given" 
	if ! defined $dir or ! -d $dir;
    if ( my $limit = delete $cfg{limit} ) {
	push @err, "limit should be number" if ! looks_like_number($limit)
    }
    for my $k (qw(content_type method)) {
	my $v = delete $cfg{$k} // next;
	push @err,"$k should be string, hash or regexp" if 
	    ref($v) and not ref($v) ~~ [ 'Regexp','HASH' ];
    }
    for my $k (qw(exclude_url only_url)) {
	my $v = delete $cfg{$k} // next;
	push @err,"$k should be regexp" if ref($v) ne 'Regexp';
    }
    delete $cfg{ignore_parameters};

    push @err, $class->SUPER::validate_cfg(%cfg);
    return @err;
}

sub str2cfg {
    my $self = shift;
    my %cfg = $self->SUPER::str2cfg(@_);
    for my $k (qw(content_type method)) {
	my $v = $cfg{$k} // next;
	if ( $v =~m{^/(.*)/$}s ) {
	    $cfg{$k} = eval { qr/$1/ } or croak("invalid regexp '$v': $@");
	} elsif (( my @v = split( /,/,$v )) > 1 ) {
	    $cfg{$k} = map { lc($_) => 1 } @v 
	} else {
	    $cfg{$k} = lc($v)
	}
    }
    for my $k (qw(exclude_url only_url)) {
	my $v = $cfg{$k} // next;
	$v =~m{^/(.*)/$}s or croak("$k should be /regex/");
	$cfg{$k} = eval { qr/$1/ } or croak("invalid regexp '$v': $@");
    }
    return %cfg;
}

sub _check_srh {
    my ($srh,$v) = @_;
    return $v =~ $srh if ref($srh) eq 'Regexp';
    return $srh->{$_} if ref($srh) eq 'HASH';
    return $srh eq $v;
}

sub _stop_saving {
    my $self = shift;
    my $f = $self->{file} or return;
    unlink($f->{tname});
    $self->{file} = undef;
    $self->run_callback(
	[ IMP_PASS,0,IMP_MAXOFFSET ],
	[ IMP_PASS,1,IMP_MAXOFFSET ],
    );
}


1;

__END__

=head1 NAME 

Net::IMP::HTTP::Example::SaveResponse - save response data to file system

=head1 SYNOPSIS

  # use App::HTTP_Proxy_IMP to listen on 127.0.0.1:8000 and save all data 
  # in myroot/
  $ perl bin/imp_http_proxy --filter Example::SaveResponse=root=myroot 127.0.0.1:8000


=head1 DESCRIPTION

This module is used to save response data into the file system.

The module has the following arguments for C<new_analyzer>:

=over 4

=item root

The base directory for saving the data. This argument is required.

=item content_type

Limits saving of the response body the given content_types, either hash,
string or regular expression.
If not given everything can be saved.

=item method

Limits saving to the given methods, either hash, string or regular
expression.
If not given everything can be saved.

=item limit

No data will be saved, if the request or response body size is greater then
the given limit.
If not given a default of 10_000_000 will be assumed. 
For unlimited saving this can be set to 0.

=item ignore_parameters

If set the contents of the query string or the post data will not be used in
creating the file name.

=item exclude_url

This regular expression describes, which URLs will not be saved.
E.g. setting it to /\?/ causes no URLs with a query string to be saved.
If not given everything can be saved.

=item only_url

If given, this regular expression limits saving to matching URLs.
If not given everything can be saved.

=back

The module has a single argument C<root> for C<new_analyzer>. 
C<root> specifies the base directory, where the data get saved.
The data are saved into a file C<root/host:port/method-md5path-md5data>, where

=over 4

=item * method is the HTTP method, lower cased

=item * md5path is the md5 hash of the path, e.g. excluding query string

=item * md5data is the md5 hash over query string joined with post data

=back

The contents of the saved file consists of the HTTP request header and body,
followed by the response header and body.
Chunking and content-encoding is removed from the body..
To speedup extraction of each of these 4 parts from the file an index of 16
byte is added at the end of the file consisisting of 4 32bit unsigned
integers in network byte order, describing the size of each part.

=head1 AUTHOR

Steffen Ullrich <sullr@cpan.org>