#!/usr/bin/perl -w

$CHILDREN = 40;        # Number of children to spawn
$TIMEOUT  = 30;        # DNS timeout
$BUFFER   = 60000;     # Maximum number of log lines to keep in memory
$FLUSH    = 3000;      # Flush output buffer every $FLUSH_LINES lines
$STATUS   = 0;         # Display status message every $STATUS lines
$TTL      = 86400 * 7; # Seconds until disk cached ips are expired
$DEBUG    = 1;

# ip2host v0.06 - Resolve IPs to hostnames in web server logs 
# Maurice Aubrey <maurice@hevanet.com>
#
# Usage: ip2host [OPTIONS] [cache_file] < infile > outfile   
#
# $Id: ip2host,v 1.4 2001/01/30 11:16:49 maurice Exp $
#
# CHANGES:
#
#   0.06 Tue Jan 30 02:56:18 PST 2001
#        - Added command-line options
#        - Place restriction on number of log lines kept
#          in memory
#        - Add DB_File subclass which understands how to
#          expire IPs
#        - Cache IPs in memory when read from disk cache
#          (substantial speed improvement)
#        - No longer using IO::Select module  
#   
#   0.05 Fri Apr 14 05:31:38 PDT 2000
#        - Add POD to allow inclusion in CPAN
#
#   0.04 Mon Nov 22 17:54:07 PST 1999
#        - Check socketpair() return value
#        - Updated documentation
# 
#   0.03 Thu Nov 18 16:57:53 PST 1999 
#        - Renamed $BUFFER to $FLUSH
#        - Improved documentation 
#
#   0.02 Sat Oct 16 00:05:29 PDT 1999
#        - Initial public release
#
# Benchmark comparison between ip2host and logresolve.pl:
#
# [maurice@foo maurice]$ wc -l access_log
#  200000 access_log
#
# [maurice@foo maurice]$ time ./ip2host --timeout=5 < access_log >/dev/null
# 26.11user 1.41system 0:36.13elapsed 76%CPU (0avgtext+0avgdata 0maxresident)k
# 0inputs+0outputs (280major+5748minor)pagefaults 0swaps
#
# [maurice@foo maurice]$ time ./logresolve.pl < access_log > /dev/null
# 19.01user 1.48system 9:10.98elapsed 3%CPU (0avgtext+0avgdata 0maxresident)k
# 0inputs+0outputs (291major+1379minor)pagefaults 0swaps

use strict;
use vars qw( 
  $CHILDREN $TIMEOUT $BUFFER $FLUSH $STATUS $TTL $DEBUG $VERSION %Cache %Opt
);
use Socket;
use Symbol ();
use Getopt::Long ();

$VERSION = '0.06';

{
  package DB_File::ip2host;

  use strict;
  use vars qw( @ISA $DEBUG $TTL %Cache );
  use Carp;

  $DEBUG = 1;
  @ISA = qw( DB_File );

  # Delay loading of DB_File module until this package is
  # actually needed.
  sub init {
    my %args = @_;

    $args{ttl} or croak "no ttl specified";

    $TTL = $args{ttl};
    require DB_File;
  }
  
  # In order to implement EXISTS, we'd need to parse
  # the value to see if the ip has expired, which is just
  # as expensive as FETCH.  So we'll just make sure we
  # never use it.
  sub EXISTS { croak "exists not implemented!"; }

  sub FETCH {
    my $self = shift;
    my $ip   = shift;

    return $Cache{ $ip } if exists $Cache{ $ip };

    my $val = $self->SUPER::FETCH( $ip );
    defined $val or return $Cache{ $ip } = undef;

    my($utc, $host) = split /:/, $val, 2;
    time - $utc < $TTL or return $Cache{ $ip } = undef;
    
    return $Cache{ $ip } = $host;
  }

  sub STORE {
    my $self = shift;
    my($ip, $host) = @_;

    return $host if defined $Cache{ $ip };
    $self->SUPER::STORE( $ip => (time . ':' . $host) );
    $Cache{ $ip } = $host;
  }

}

{
  my $last_msg = ''; # closure to remember last msg displayed
  sub status {
    my $msg = shift;

    my $gap = length($last_msg) - length($msg);
    $last_msg = $msg;
    join '', "\r", $msg, ($gap > 0 ? " " x $gap : '');
  }
}            

sub usage {
  my $exit = shift || 0;

  print STDERR <<EOF;
$0 version $VERSION

Usage: $0 [OPTIONS] [cache_file] < input_log > output_log

  infile          Web server log file.  Any log format is acceptable,
                  as long as each line begins with the remote client's
                  IP address.

  outfile         Same as input file, but with all of the IPs resolved
                  to hostnames.   

  Options:

  --help          Display this help and exit
  --children=...  Number of child processes to spawn (default: $CHILDREN)
  --timeout=...   Seconds to wait on DNS response (default: $TIMEOUT) 
  --buffer=...    Maximum number of log lines to keep in
                  memory (default: $BUFFER)
  --flush=...     Number of lines to process before flushing
                  output buffer (default: $FLUSH) 
  --status=...    Display status message every N lines (default: none)
  --cache=...     Filename to use as disk cache (default: none)
  --ttl=...       Number of seconds before IP cached on disk is expired
                  (default: $TTL)

See the POD for more details:

  perldoc $0

Copyright 1999-2001, Maurice Aubrey <maurice\@hevanet.com>.
All rights reserved.

This module is free software; you may redistribute it and/or
modify it under the same terms as Perl itself.      
EOF

  exit $exit;
}

sub child_resolver {
  my($fh, $timeout) = @_;

  $SIG{'ALRM'} = sub { die 'alarmed' }; 
  while(defined(my $ip = <$fh>)) { # Get IP to resolve
    chomp($ip);
    my $host = undef;
    eval { # Try to resolve, but give up after $TIMEOUT seconds
      alarm( $timeout );
      my $ip_struct = inet_aton $ip;
      $host = gethostbyaddr $ip_struct, AF_INET;
      alarm(0);
    };
    # XXX Debug
    if ($DEBUG and $@ =~ /alarm/) {
      $host ||= 'TIMEOUT';
      # print STDERR "Alarming ($ip)...\n";
    }
    $host ||= $ip;
    print $fh "$ip $host\n";
  }
}

%Opt = (
  children => $CHILDREN,
  timeout  => $TIMEOUT,
  buffer   => $BUFFER,  
  flush    => $FLUSH,
  status   => $STATUS,
  cache    => undef,
  ttl      => $TTL,
);
Getopt::Long::GetOptions(\%Opt, 
  "children|kids=i", 
  "timeout=i",
  "buffer=i",
  "flush=i",
  "status=i",
  "ttl=i",
  "cache=s",
  "usage|help|version",
);
usage(0) if $Opt{usage};
usage(1) if @ARGV > 1;
$Opt{cache} = shift @ARGV if @ARGV;  

# Spawn the children
my %kids = (
  wtr_vec    => '',
  rdr_vec    => '',
  fh         => {},
  max_fileno => 0,
  min_fileno => undef,
);
for(my $child = 1; $child <= $Opt{children}; $child++) {
  my($child_fh, $parent_fh) = (Symbol::gensym, Symbol::gensym);
  socketpair($child_fh, $parent_fh, AF_UNIX, SOCK_STREAM, PF_UNSPEC)
    or die "$0 socketpair failed: $!";
  select $child_fh;  $| = 1;
  select $parent_fh; $| = 1;
  select STDOUT;

  if (my $pid = fork) {
    close $parent_fh;
    my $child_fileno = fileno( $child_fh );
    $kids{fh}{ $child_fileno } = $child_fh;
    vec($kids{wtr_vec}, $child_fileno, 1) = 1; # start out writing to all
    $kids{min_fileno} = $child_fileno unless defined $kids{min_fileno};
    $kids{max_fileno} = $child_fileno;
  } else { # Child starts here
    defined $pid or die "$0 fork failed: $!";
    close $child_fh; close STDIN; close STDOUT; 
    child_resolver( $parent_fh, $Opt{timeout} );
    exit 0;
  }
}

if ($Opt{cache}) { # Cache results to disk if asked
  DB_File::ip2host::init( ttl => $Opt{ttl} );
  tie %Cache, 'DB_File::ip2host', $Opt{cache}
    or die "$0 unable to tie '$Opt{cache}': $!";
}              

my $lineno      = 0;
my $next_line   = 1;
my $is_eof      = 0;
my $buffer_used = 0;
my $queries     = 0; 
my %output;
my %pending;

# Write as many lines as we can until we come across one
# that's missing (that means it's still pending DNS).
sub flush_output {

  for (; exists $output{ $next_line }; $next_line++) {
    print delete $output{ $next_line };
    --$buffer_used;
  }
}      

sub myselect {
  # If buffer is full, only wait on responses
  my $wtr_vec = ($buffer_used >= $Opt{buffer} ? undef : $kids{wtr_vec});

  my($rout, $wout);
  select($rout = $kids{rdr_vec}, $wout = $wtr_vec, undef, undef);

  my @readable;
  if ($rout) {
    for(my $i = $kids{min_fileno}; $i <= $kids{max_fileno}; $i++) {
      next unless vec($rout, $i, 1);

      # move from read to write set
      vec($kids{rdr_vec}, $i, 1) = 0;
      vec($kids{wtr_vec}, $i, 1) = 1 unless $is_eof;
      push @readable, $kids{fh}{ $i };
    }
  }

  my @writeable;
  if ($wout) {
    for(my $i = $kids{min_fileno}; $i <= $kids{max_fileno}; $i++) {
      next unless vec($wout, $i, 1);       

      # move from write to read set
      vec($kids{wtr_vec}, $i, 1) = 0;
      vec($kids{rdr_vec}, $i, 1) = 1;
      push @writeable, $kids{fh}{ $i };
    }
  }

  (\@readable, \@writeable);
}                     

sub show_status {
  print STDERR status(join ' ',
    "line=$lineno",
    "pending=" . scalar keys %pending,
    "queries=$queries",
    # "scalar keys" is very expensive against a DB file, so we
    # call it directly on its in-memory store.
    "ip_cache=" . ($Opt{cache} ? scalar keys %DB_File::ip2host::Cache 
                               : scalar keys %Cache),
    "buffer=$buffer_used",
  );
}

while(1) {
  my($readable, $writeable) = myselect;

  # One or more children ready for an IP
  while (@$writeable and $buffer_used < $Opt{buffer}) { 
    my $line = <STDIN>;
    $is_eof = 1, last unless defined $line;
    ++$lineno;
    my($ip, $rest) = split / /, $line, 2;
    if (my $cached_ip = $Cache{ $ip }) { # We found this answer already 
      $output{ $lineno } = "$cached_ip $rest";
    } elsif (exists $pending{ $ip }) { # We're still looking
      push @{ $pending{ $ip } }, [ $lineno, $rest ];
    } else { # Send IP to child
      my $write_fh = shift @$writeable;
      print $write_fh "$ip\n";
      $pending{ $ip } = [ [ $lineno, $rest ] ];
      $queries++;
    }
    $buffer_used++;
    flush_output if exists $output{ $next_line } and
      ($buffer_used >= $Opt{buffer} or $lineno % $Opt{flush} == 0);
    show_status if $Opt{status} and $lineno % $Opt{status} == 0; 
  }

  while (@$readable) { # One or more children have an answer
    my $read_fh = shift @$readable;
    my $str = <$read_fh>;
    defined $str or next;
    chomp($str);
    my($ip, $host) = split / /, $str, 2;
    $Cache{ $ip } = $host;
    # Take all the lines that were pending for this IP and
    # toss them into the output buffer
    foreach my $pending (@{ $pending{ $ip } }) {
      $output{ $pending->[0] } = "$host $pending->[1]";
    }
    delete $pending{ $ip };
    flush_output if exists $output{ $next_line } and
      ($buffer_used >= $Opt{buffer} or $lineno % $Opt{flush} == 0);
    show_status if $is_eof and $Opt{status};
  }                        

  last if $is_eof and not keys %pending;
}

flush_output;
print STDERR "\n" if $Opt{status};

=pod

=head1 NAME

  ip2host - Resolve IPs to hostnames in web server logs

=head1 SYNOPSIS

  ip2host [OPTIONS] [cache_file] < infile > outfile

  infile  - Web server log file.  Any log format is acceptable, 
            as long as each line begins with the remote client's 
            IP address.

  outfile - Same as input file, but with all of the IPs resolved 
            to hostnames.        

=head1 DESCRIPTION

This script is a drop-in replacement for the logresolve.pl
script distributed with the Apache web server.

ip2host has the same basic design of forking children to handle
the DNS resolution in parallel, but multiplexes the communication
to minimize the impact of slow responses.  This results in a
significant speed improvement (approximately 10x faster), and 
the performance degrades more gracefully as the DNS timeout value
is increased.

For a description of the command-line options type:

  ./ip2host --help 

This script is reported to work under Linux, FreeBSD, Solaris,
Tru64, and IRIX.  

=head1 AUTHOR 

Maurice Aubrey E<lt>maurice@hevanet.comE<gt>

=head1 COPYRIGHT

Copyright 1999-2001, Maurice Aubrey E<lt>maurice@hevanet.comE<gt>.
All rights reserved.

This module is free software; you may redistribute it and/or
modify it under the same terms as Perl itself.

=head1 README

Drop-in replacement for the logresolve.pl script distributed
with the Apache web server that's approximately 10x faster.

=head1 SCRIPT CATEGORIES

Web        

=cut