#!/usr/bin/perl -w $CHILDREN = 40; # Number of children to spawn $TIMEOUT = 30; # DNS timeout $BUFFER = 60000; # Maximum number of log lines to keep in memory $FLUSH = 3000; # Flush output buffer every $FLUSH_LINES lines $STATUS = 0; # Display status message every $STATUS lines $TTL = 86400 * 7; # Seconds until disk cached ips are expired $DEBUG = 1; # ip2host v0.06 - Resolve IPs to hostnames in web server logs # Maurice Aubrey # # Usage: ip2host [OPTIONS] [cache_file] < infile > outfile # # $Id: ip2host,v 1.4 2001/01/30 11:16:49 maurice Exp $ # # CHANGES: # # 0.06 Tue Jan 30 02:56:18 PST 2001 # - Added command-line options # - Place restriction on number of log lines kept # in memory # - Add DB_File subclass which understands how to # expire IPs # - Cache IPs in memory when read from disk cache # (substantial speed improvement) # - No longer using IO::Select module # # 0.05 Fri Apr 14 05:31:38 PDT 2000 # - Add POD to allow inclusion in CPAN # # 0.04 Mon Nov 22 17:54:07 PST 1999 # - Check socketpair() return value # - Updated documentation # # 0.03 Thu Nov 18 16:57:53 PST 1999 # - Renamed $BUFFER to $FLUSH # - Improved documentation # # 0.02 Sat Oct 16 00:05:29 PDT 1999 # - Initial public release # # Benchmark comparison between ip2host and logresolve.pl: # # [maurice@foo maurice]$ wc -l access_log # 200000 access_log # # [maurice@foo maurice]$ time ./ip2host --timeout=5 < access_log >/dev/null # 26.11user 1.41system 0:36.13elapsed 76%CPU (0avgtext+0avgdata 0maxresident)k # 0inputs+0outputs (280major+5748minor)pagefaults 0swaps # # [maurice@foo maurice]$ time ./logresolve.pl < access_log > /dev/null # 19.01user 1.48system 9:10.98elapsed 3%CPU (0avgtext+0avgdata 0maxresident)k # 0inputs+0outputs (291major+1379minor)pagefaults 0swaps use strict; use vars qw( $CHILDREN $TIMEOUT $BUFFER $FLUSH $STATUS $TTL $DEBUG $VERSION %Cache %Opt ); use Socket; use Symbol (); use Getopt::Long (); $VERSION = '0.06'; { package DB_File::ip2host; use strict; use vars qw( @ISA $DEBUG $TTL %Cache ); use Carp; $DEBUG = 1; @ISA = qw( DB_File ); # Delay loading of DB_File module until this package is # actually needed. sub init { my %args = @_; $args{ttl} or croak "no ttl specified"; $TTL = $args{ttl}; require DB_File; } # In order to implement EXISTS, we'd need to parse # the value to see if the ip has expired, which is just # as expensive as FETCH. So we'll just make sure we # never use it. sub EXISTS { croak "exists not implemented!"; } sub FETCH { my $self = shift; my $ip = shift; return $Cache{ $ip } if exists $Cache{ $ip }; my $val = $self->SUPER::FETCH( $ip ); defined $val or return $Cache{ $ip } = undef; my($utc, $host) = split /:/, $val, 2; time - $utc < $TTL or return $Cache{ $ip } = undef; return $Cache{ $ip } = $host; } sub STORE { my $self = shift; my($ip, $host) = @_; return $host if defined $Cache{ $ip }; $self->SUPER::STORE( $ip => (time . ':' . $host) ); $Cache{ $ip } = $host; } } { my $last_msg = ''; # closure to remember last msg displayed sub status { my $msg = shift; my $gap = length($last_msg) - length($msg); $last_msg = $msg; join '', "\r", $msg, ($gap > 0 ? " " x $gap : ''); } } sub usage { my $exit = shift || 0; print STDERR < output_log infile Web server log file. Any log format is acceptable, as long as each line begins with the remote client's IP address. outfile Same as input file, but with all of the IPs resolved to hostnames. Options: --help Display this help and exit --children=... Number of child processes to spawn (default: $CHILDREN) --timeout=... Seconds to wait on DNS response (default: $TIMEOUT) --buffer=... Maximum number of log lines to keep in memory (default: $BUFFER) --flush=... Number of lines to process before flushing output buffer (default: $FLUSH) --status=... Display status message every N lines (default: none) --cache=... Filename to use as disk cache (default: none) --ttl=... Number of seconds before IP cached on disk is expired (default: $TTL) See the POD for more details: perldoc $0 Copyright 1999-2001, Maurice Aubrey . All rights reserved. This module is free software; you may redistribute it and/or modify it under the same terms as Perl itself. EOF exit $exit; } sub child_resolver { my($fh, $timeout) = @_; $SIG{'ALRM'} = sub { die 'alarmed' }; while(defined(my $ip = <$fh>)) { # Get IP to resolve chomp($ip); my $host = undef; eval { # Try to resolve, but give up after $TIMEOUT seconds alarm( $timeout ); my $ip_struct = inet_aton $ip; $host = gethostbyaddr $ip_struct, AF_INET; alarm(0); }; # XXX Debug if ($DEBUG and $@ =~ /alarm/) { $host ||= 'TIMEOUT'; # print STDERR "Alarming ($ip)...\n"; } $host ||= $ip; print $fh "$ip $host\n"; } } %Opt = ( children => $CHILDREN, timeout => $TIMEOUT, buffer => $BUFFER, flush => $FLUSH, status => $STATUS, cache => undef, ttl => $TTL, ); Getopt::Long::GetOptions(\%Opt, "children|kids=i", "timeout=i", "buffer=i", "flush=i", "status=i", "ttl=i", "cache=s", "usage|help|version", ); usage(0) if $Opt{usage}; usage(1) if @ARGV > 1; $Opt{cache} = shift @ARGV if @ARGV; # Spawn the children my %kids = ( wtr_vec => '', rdr_vec => '', fh => {}, max_fileno => 0, min_fileno => undef, ); for(my $child = 1; $child <= $Opt{children}; $child++) { my($child_fh, $parent_fh) = (Symbol::gensym, Symbol::gensym); socketpair($child_fh, $parent_fh, AF_UNIX, SOCK_STREAM, PF_UNSPEC) or die "$0 socketpair failed: $!"; select $child_fh; $| = 1; select $parent_fh; $| = 1; select STDOUT; if (my $pid = fork) { close $parent_fh; my $child_fileno = fileno( $child_fh ); $kids{fh}{ $child_fileno } = $child_fh; vec($kids{wtr_vec}, $child_fileno, 1) = 1; # start out writing to all $kids{min_fileno} = $child_fileno unless defined $kids{min_fileno}; $kids{max_fileno} = $child_fileno; } else { # Child starts here defined $pid or die "$0 fork failed: $!"; close $child_fh; close STDIN; close STDOUT; child_resolver( $parent_fh, $Opt{timeout} ); exit 0; } } if ($Opt{cache}) { # Cache results to disk if asked DB_File::ip2host::init( ttl => $Opt{ttl} ); tie %Cache, 'DB_File::ip2host', $Opt{cache} or die "$0 unable to tie '$Opt{cache}': $!"; } my $lineno = 0; my $next_line = 1; my $is_eof = 0; my $buffer_used = 0; my $queries = 0; my %output; my %pending; # Write as many lines as we can until we come across one # that's missing (that means it's still pending DNS). sub flush_output { for (; exists $output{ $next_line }; $next_line++) { print delete $output{ $next_line }; --$buffer_used; } } sub myselect { # If buffer is full, only wait on responses my $wtr_vec = ($buffer_used >= $Opt{buffer} ? undef : $kids{wtr_vec}); my($rout, $wout); select($rout = $kids{rdr_vec}, $wout = $wtr_vec, undef, undef); my @readable; if ($rout) { for(my $i = $kids{min_fileno}; $i <= $kids{max_fileno}; $i++) { next unless vec($rout, $i, 1); # move from read to write set vec($kids{rdr_vec}, $i, 1) = 0; vec($kids{wtr_vec}, $i, 1) = 1 unless $is_eof; push @readable, $kids{fh}{ $i }; } } my @writeable; if ($wout) { for(my $i = $kids{min_fileno}; $i <= $kids{max_fileno}; $i++) { next unless vec($wout, $i, 1); # move from write to read set vec($kids{wtr_vec}, $i, 1) = 0; vec($kids{rdr_vec}, $i, 1) = 1; push @writeable, $kids{fh}{ $i }; } } (\@readable, \@writeable); } sub show_status { print STDERR status(join ' ', "line=$lineno", "pending=" . scalar keys %pending, "queries=$queries", # "scalar keys" is very expensive against a DB file, so we # call it directly on its in-memory store. "ip_cache=" . ($Opt{cache} ? scalar keys %DB_File::ip2host::Cache : scalar keys %Cache), "buffer=$buffer_used", ); } while(1) { my($readable, $writeable) = myselect; # One or more children ready for an IP while (@$writeable and $buffer_used < $Opt{buffer}) { my $line = ; $is_eof = 1, last unless defined $line; ++$lineno; my($ip, $rest) = split / /, $line, 2; if (my $cached_ip = $Cache{ $ip }) { # We found this answer already $output{ $lineno } = "$cached_ip $rest"; } elsif (exists $pending{ $ip }) { # We're still looking push @{ $pending{ $ip } }, [ $lineno, $rest ]; } else { # Send IP to child my $write_fh = shift @$writeable; print $write_fh "$ip\n"; $pending{ $ip } = [ [ $lineno, $rest ] ]; $queries++; } $buffer_used++; flush_output if exists $output{ $next_line } and ($buffer_used >= $Opt{buffer} or $lineno % $Opt{flush} == 0); show_status if $Opt{status} and $lineno % $Opt{status} == 0; } while (@$readable) { # One or more children have an answer my $read_fh = shift @$readable; my $str = <$read_fh>; defined $str or next; chomp($str); my($ip, $host) = split / /, $str, 2; $Cache{ $ip } = $host; # Take all the lines that were pending for this IP and # toss them into the output buffer foreach my $pending (@{ $pending{ $ip } }) { $output{ $pending->[0] } = "$host $pending->[1]"; } delete $pending{ $ip }; flush_output if exists $output{ $next_line } and ($buffer_used >= $Opt{buffer} or $lineno % $Opt{flush} == 0); show_status if $is_eof and $Opt{status}; } last if $is_eof and not keys %pending; } flush_output; print STDERR "\n" if $Opt{status}; =pod =head1 NAME ip2host - Resolve IPs to hostnames in web server logs =head1 SYNOPSIS ip2host [OPTIONS] [cache_file] < infile > outfile infile - Web server log file. Any log format is acceptable, as long as each line begins with the remote client's IP address. outfile - Same as input file, but with all of the IPs resolved to hostnames. =head1 DESCRIPTION This script is a drop-in replacement for the logresolve.pl script distributed with the Apache web server. ip2host has the same basic design of forking children to handle the DNS resolution in parallel, but multiplexes the communication to minimize the impact of slow responses. This results in a significant speed improvement (approximately 10x faster), and the performance degrades more gracefully as the DNS timeout value is increased. For a description of the command-line options type: ./ip2host --help This script is reported to work under Linux, FreeBSD, Solaris, Tru64, and IRIX. =head1 AUTHOR Maurice Aubrey Emaurice@hevanet.comE =head1 COPYRIGHT Copyright 1999-2001, Maurice Aubrey Emaurice@hevanet.comE. All rights reserved. This module is free software; you may redistribute it and/or modify it under the same terms as Perl itself. =head1 README Drop-in replacement for the logresolve.pl script distributed with the Apache web server that's approximately 10x faster. =head1 SCRIPT CATEGORIES Web =cut