use strict; use warnings; package Net::IMP::HTTP::Example::SaveResponse; use base 'Net::IMP::HTTP::Request'; use fields qw(root file); use Net::IMP; use Net::IMP::Debug; use File::Path 'make_path'; use File::Temp 'tempfile'; use Digest::MD5; use Carp; use Scalar::Util 'looks_like_number'; no if $] >= 5.017011, warnings => 'experimental::smartmatch'; my $DEFAULT_LIMIT = 10_000_000; sub RTYPES { (IMP_PREPASS) } sub new_factory { my ($class,%args) = @_; my $dir = $args{root} or croak("no root directory given"); -d $dir && -r _ && -x _ or croak("cannot use base dir $dir: $!"); $args{limit} = $DEFAULT_LIMIT if ! defined $args{limit}; return $class->SUPER::new_factory(%args); } sub new_analyzer { my ($factory,%args) = @_; my $self = $factory->SUPER::new_analyzer(%args); # we don't modify $self->run_callback( [ IMP_PREPASS,0,IMP_MAXOFFSET ], [ IMP_PREPASS,1,IMP_MAXOFFSET ] ); return $self; } sub DESTROY { my $self = shift; $self->{file} && $self->{file}{tname} && unlink($self->{file}{tname}); } sub request_hdr { my ($self,$hdr) = @_; my ($method,$proto,$host,$path) = $hdr =~m{\A([A-Z]+) +(?:(\w+)://([^/]+))?(\S+)}; $host = $1 if $hdr =~m{\nHost: *(\S+)}i; $host or goto IGNORE; $proto ||= 'http'; $host = lc($host); my $port = $host=~s{^(?:\[(\w._\-:)+\]|(\w._\-))(?::(\d+))?$}{ $1 || $2 }e ? $3:80; if ( my $rx = $self->{factory_args}{only_url} ) { goto IGNORE if "$proto://$host:$port$path" !~ $rx and "$proto://$host$path" !~ $rx } if ( my $rx = $self->{factory_args}{exclude_url} ) { goto IGNORE if "$proto://$host:$port$path" =~ $rx or "$proto://$host$path" =~ $rx } if ( my $srh = $self->{factory_args}{method} ) { goto IGNORE if ! _check_srh($srh,$method); } my $dir = $self->{factory_args}{root}."/$host:$port"; if ( ! -d $dir ) { my $err; make_path($dir, { error => \$err }); } my ($fh,$fname) = tempfile( "tmpXXXXXXX", DIR => $dir ) or goto IGNORE; $hdr =~s{^(Content-encoding:|Transfer-encoding:|Content-length:)}{X-Original-$1}mig; print $fh $hdr; my $qstring = $path =~s{\?(.+)}{} ? $1 : undef; $self->{file} = { tfh => $fh, tname => $fname, dir => $dir, method => $method, md5path => Digest::MD5->new->add($path)->hexdigest, md5data => undef, size => [ length($hdr),0,0,0 ], rphdr => '', rpbody => '', eof => 0, }; ( $self->{file}{md5data} = Digest::MD5->new )->add("\000$qstring\001") if defined $qstring and ! $self->{factory_args}{ignore_parameters}; return; # continue in request body IGNORE: # pass thru w/o saving debug("no save $host:$port/$path"); $self->run_callback( # pass thru everything [ IMP_PASS,0,IMP_MAXOFFSET ], [ IMP_PASS,1,IMP_MAXOFFSET ], ); } sub request_body { my ($self,$data) = @_; my $f = $self->{file} or return; print { $f->{tfh} } $data; my $md = $f->{md5data}; if ( $data ne '' ) { $f->{size}[1] += length($data); if ( my $l = $self->{factory_args}{limit} ) { return _stop_saving($self) if $f->{size}[1] > $l; } if ( ! $md ) { return if $self->{factory_args}{ignore_parameters}; $md = $f->{md5data} = Digest::MD5->new; } $md->add($data); return; } if ( defined( my $rp = $f->{rphdr} )) { print { $f->{tfh} } $rp; $f->{rphdr} = undef; if ( defined( $rp = $f->{rpbody} )) { print { $f->{tfh} } $rp; $f->{rpbody} = undef; } } _check_eof($self,1); } sub response_hdr { my ($self,$hdr) = @_; my $f = $self->{file} or return; return _stop_saving($self) if $hdr =~m{\AHTTP/1\.[01] (100|304|5\d\d)}; if ( my $srh = $self->{factory_args}{content_type} ) { my ($ct) = $hdr =~m{^Content-type:\s*([^\s;]+)}mi; $ct ||= 'application/octet-stream'; return _stop_saving($self) if ! _check_srh( $srh, lc($ct)); } $hdr =~s{^(Content-encoding:|Transfer-encoding:|Content-length:)}{X-Original-$1}mig; $f->{size}[2] = length($hdr); if ( defined $f->{rphdr} ) { # defer, request body not fully read $f->{rphdr} = $hdr; } else { print {$f->{tfh}} $hdr; } } sub response_body { my ($self,$data) = @_; my $f = $self->{file} or return; $f->{size}[3] += length($data); if ( my $l = $self->{factory_args}{limit} ) { return _stop_saving($self) if $f->{size}[3] > $l; } if ( $data eq '' ) { _check_eof($self,2) } elsif ( defined $f->{rpbody} ) { $f->{rpbody} .= $data; } else { print {$f->{tfh}} $data; } } sub _check_eof { my ($self,$bit) = @_; my $f = $self->{file} or return; ( $f->{eof} |= $bit ) == 3 or return; $self->{file} = undef; print {$f->{tfh}} pack("NNNN",@{ $f->{size} }); close($f->{tfh}); my $fname = "$f->{dir}/".join( "-", lc($f->{method}), $f->{md5path}, $f->{md5data} ? ($f->{md5data}->hexdigest):() ); rename($f->{tname}, $fname); } # will not be tracked sub any_data { my $self = shift; my $f = $self->{file} or return; unlink($f->{tname}); $self->{file} = undef; } ### config stuff ###### sub validate_cfg { my ($class,%cfg) = @_; my $dir = delete $cfg{root}; my @err; push @err, "no or non-existing root dir given" if ! defined $dir or ! -d $dir; if ( my $limit = delete $cfg{limit} ) { push @err, "limit should be number" if ! looks_like_number($limit) } for my $k (qw(content_type method)) { my $v = delete $cfg{$k} // next; push @err,"$k should be string, hash or regexp" if ref($v) and not ref($v) ~~ [ 'Regexp','HASH' ]; } for my $k (qw(exclude_url only_url)) { my $v = delete $cfg{$k} // next; push @err,"$k should be regexp" if ref($v) ne 'Regexp'; } delete $cfg{ignore_parameters}; push @err, $class->SUPER::validate_cfg(%cfg); return @err; } sub str2cfg { my $self = shift; my %cfg = $self->SUPER::str2cfg(@_); for my $k (qw(content_type method)) { my $v = $cfg{$k} // next; if ( $v =~m{^/(.*)/$}s ) { $cfg{$k} = eval { qr/$1/ } or croak("invalid regexp '$v': $@"); } elsif (( my @v = split( /,/,$v )) > 1 ) { $cfg{$k} = map { lc($_) => 1 } @v } else { $cfg{$k} = lc($v) } } for my $k (qw(exclude_url only_url)) { my $v = $cfg{$k} // next; $v =~m{^/(.*)/$}s or croak("$k should be /regex/"); $cfg{$k} = eval { qr/$1/ } or croak("invalid regexp '$v': $@"); } return %cfg; } sub _check_srh { my ($srh,$v) = @_; return $v =~ $srh if ref($srh) eq 'Regexp'; return $srh->{$_} if ref($srh) eq 'HASH'; return $srh eq $v; } sub _stop_saving { my $self = shift; my $f = $self->{file} or return; unlink($f->{tname}); $self->{file} = undef; $self->run_callback( [ IMP_PASS,0,IMP_MAXOFFSET ], [ IMP_PASS,1,IMP_MAXOFFSET ], ); } 1; __END__ =head1 NAME Net::IMP::HTTP::Example::SaveResponse - save response data to file system =head1 SYNOPSIS # use App::HTTP_Proxy_IMP to listen on 127.0.0.1:8000 and save all data # in myroot/ $ perl bin/imp_http_proxy --filter Example::SaveResponse=root=myroot 127.0.0.1:8000 =head1 DESCRIPTION This module is used to save response data into the file system. The module has the following arguments for C<new_analyzer>: =over 4 =item root The base directory for saving the data. This argument is required. =item content_type Limits saving of the response body the given content_types, either hash, string or regular expression. If not given everything can be saved. =item method Limits saving to the given methods, either hash, string or regular expression. If not given everything can be saved. =item limit No data will be saved, if the request or response body size is greater then the given limit. If not given a default of 10_000_000 will be assumed. For unlimited saving this can be set to 0. =item ignore_parameters If set the contents of the query string or the post data will not be used in creating the file name. =item exclude_url This regular expression describes, which URLs will not be saved. E.g. setting it to /\?/ causes no URLs with a query string to be saved. If not given everything can be saved. =item only_url If given, this regular expression limits saving to matching URLs. If not given everything can be saved. =back The module has a single argument C<root> for C<new_analyzer>. C<root> specifies the base directory, where the data get saved. The data are saved into a file C<root/host:port/method-md5path-md5data>, where =over 4 =item * method is the HTTP method, lower cased =item * md5path is the md5 hash of the path, e.g. excluding query string =item * md5data is the md5 hash over query string joined with post data =back The contents of the saved file consists of the HTTP request header and body, followed by the response header and body. Chunking and content-encoding is removed from the body.. To speedup extraction of each of these 4 parts from the file an index of 16 byte is added at the end of the file consisisting of 4 32bit unsigned integers in network byte order, describing the size of each part. =head1 AUTHOR Steffen Ullrich <sullr@cpan.org>