#! /usr/bin/perl -w # # lgrep v1.0: human-readable view of Apache webserver logs # Bret Victor, bret@ugcs.caltech.edu # http://www.ugcs.caltech.edu/~bret # # lgrep WORD [ CONDITIONS ] # searches today's server logs and returns the hits that contain WORD and # match the conditions. Normally, the hits are shown chronologically, # and are grouped by IP address. This works well for sparse, non- # overlapping user sessions or when you don't care much about the grouping. # To track multiple overlapping user sessions, see "groupip" below. # # A "condition" can be one of these condition words: # # nopics excludes all hits on image files # nobots excludes all hits by webbots # nome excludes all hits from your own ip address # good excludes all non-2xx (HTTP OK) hits # bad excludes all 2xx (HTTP OK) hits # user excludes all hits which are not in WORD's user directory. # This is useful if your username happens to be a common # substring in other people's files. # # or it can be one of these (non-conditional) option commands: # # ipname tries to reverse-DNS the IP addresses (can be slow) # nosort doesn't bother to sort the hits chronologically # groupip reorders the hits to group together all accesses from # the same IP address. The groups are ordered by the # time of their first hit (unless "nosort" is given) # nobsr supresses printing of the browser identification string # noref supresses printing of the referrer URL # noip supresses printing of the IP address # noinfo same as all three above commands together; just prints # specific hit data, with no group info # murmur supresses output for specific hits, but still prints # IP, referrer, and browser info for groups # quiet supresses all normal output # debug complains about malformed log lines instead of just # skipping them # # or it can be a Perl expression involving these variables: # # $ipaddr ip address string ("xxx.xxx.xxx.xxx") # $ipname hostname (will be equal to $ipaddr if "ipname" command # is not given, or if reverse-DNS fails) # $request HTTP request string # $page requested pathname, abbreviated # $user username whose directory the page is in, or "" if none # $browser web browser identification string # $referrer referring page URL # $search search string if referred by search engine, or "" if not # $response HTTP response code number # $seconds time, in seconds since epoch # $date date string ("mm/dd/yy") # $time time string ("hh/mm", 24-hour time) # $logline complete line in server log # $print assign to this variable to print output before the # hit info is printed (but after group info) # $println same as $print, but adds a newline # $end assign to this variable to print output right before the # program exits # # # Examples: # # # this is how I normally look at my traffic # lgrep bret nopics nobots ipname groupip # # # show only form submissions # lgrep bret '$request =~ /^POST/' # # # show only Mozilla users and don't show image files or bad hits # lgrep bret good nopics '$browser =~ /mozilla/i' # # # show only bad hits, and prefix each hit with the response code # lgrep bret bad '$print = " $response: "' # # # show all server activity at noon today # lgrep - '$time eq "12:00"' # # # design your own custom output format # lgrep bret ipname quiet '$println = "$ipname ($ipaddr): $date $time"' # # # just print a count of unique human visitors # lgrep bret user nobots quiet '${z}{$ipaddr}++; $end = keys %z' # # # see what sorts of perverted things people like to websearch # lgrep sex quiet '$println = $search' # # # Notes: # # If you are not running this on UGCS, change the # my @logs = ... # line to a list of the pathnames to your actual server logs, or # pipe your server logs in through STDIN. # # If you hook this up to a CGI script (or otherwise run it setuid), # be very VERY careful, since Perl conditions are eval'd directly. # Consider using the Safe module, or disallowing custom conditions # entirely. # # This program was written quickly and sorta off the top of my head, # so don't expect it to be bulletproof or even entirely correct. # Suggestions are welcome. # use Socket; use Time::Local; # set up useful variables my $myname = `whoami` || $ENV{USER} || ""; chomp $myname; my ($myip) = split ' ', $ENV{SSH_CLIENT} || "0.0.0.0"; my %months = (jan=>0, feb=>1, mar=>2, apr=>3, may=>4, jun=>5, jul=>6, aug=>7, sep=>8, oct=>9, nov=>10, dec=>11); my %ipcache; # parse command line options my $word = shift || die "usage: lgrep WORD [ CONDITIONS ]\n"; my %options; my @conditions = grep { s/\$(\w+)/\$_->{$1}/g or /\W/ or $options{$_}++ && 0 } @ARGV; # read raw data from server logs, or from STDIN if it's pipe-like my @logs = map "/ug/adm/log/www/$_/access_log", `ypmatch linuxclient netgroup` =~ /\w+/g if -t STDIN; my @lines = `grep -h $word @logs`; # parse log data and convert to a list of hashes # (The nasty "eval" business is because there is no way to short-circuit # a map block.) my @accesses = map { my $hash = eval { # parse log line my ($date, $ipaddr, $response, $request, $referrer, $browser) = /^ \[ (.*?) \] \s # date .+? \s # ??? 1590 .+? \s # ??? 1 (\S+) \s # ip address (\d+) .+? \s # HTTP response code .+? \s # ??? 84539 .+? \s # ??? - " (.*?) " \s # HTTP request (webpage) " (.*?) " \s # referrer URL " (.*?) " # browser name /x; die "What's up with this log line?\n$_\n" unless $ipaddr; # do the DNS or reverse-DNS, if desired my $ipname; if (!$options{ipname}) { $ipname = $ipaddr; } elsif ($ipaddr !~ /[^\d.]/) { $ipname = $ipcache{$ipaddr} ||= gethostbyaddr(inet_aton($ipaddr), AF_INET) || $ipaddr; } else { $ipname = $ipaddr; my $packedip = $ipcache{$ipname} ||= gethostbyname($ipname); $ipaddr = $packedip ? inet_ntoa($packedip) : "0.0.0.0"; } # parse the date string my ($mday, $mon, $year, $hour, $min, $sec) = ($date =~ m| (\d+) / (\w+) / (\d+) : (\d+) : (\d+) : (\d+) |x); die "What's up with the date in this log line?\n$_\n" unless $mday; $mon = $months{lc $mon}; my $seconds = timelocal($sec,$min,$hour,$mday,$mon,$year-1900); my $time = sprintf "%d:%02d", $hour, $min; $date = sprintf "%d/%d/%02d", $mon+1, $mday, $year%100; # abbreviate the target URL and figure out the target user my $page = uncode($request); my ($user) = ($page =~ m|^\w+ /~([\w\d]+)|); $user ||= ""; $page =~ s|^\w+ /|| and $page =~ s|^~$myname|~|o; $page =~ s| HTTP.*?$||; # figure out the search engine query string, if any $referrer = "" if length $referrer < 2; my $search = ""; my (undef, $query) = split /\?/, $referrer; if ($query) { my $key; for ($referrer) { $key = /yahoo/ && 'p' || /northernlight/ && 'qr' || /webcrawler/ && 'qkw' || /askjeeves/ && 'ask' || /aj\.com/ && 'ask' || 'q'; } if ($query =~ / (?:^|&) $key= ([^&]+) /x) { $search = uncode($1); $search =~ tr/+/ /; if ($search =~ /^cache:/) { $search = "" } } } $referrer = uncode($referrer); # construct the hash to represent this hit { ipaddr => $ipaddr, ipname => $ipname, request => $request, page => $page, user => $user, browser => $browser, referrer => $referrer, search => $search, response => $response, seconds => $seconds, date => $date, time => $time, logline => $_ } }; $@ ? ( $options{debug} ? die $@ : () ) : $hash; } @lines; # sort list chronologically by time and then alphabetically by URL unless ($options{nosort}) { @accesses = sort { $a->{seconds} <=> $b->{seconds} or $a->{page} cmp $b->{page} } @accesses; } # group list by IP address, if desired if ($options{groupip}) { my %seenip; @accesses = map { my $ipaddr = $_->{ipaddr}; $seenip{$ipaddr}++ ? () : grep { $_->{ipaddr} eq $ipaddr } @accesses; } @accesses; } # go through list, evaluating conditions and printing output my ($lastip, $lastref, $end) = ("","",""); ACCESS: foreach (@accesses) { # check preset condition words next if $options{good} and $_->{response} !~ /2/; next if $options{bad} and $_->{response} =~ /2/; next if $options{nome} and $_->{ipaddr} eq $myip; next if $options{user} and !$_->{user} || $_->{user} ne $word; next if $options{nopics} and $_->{page} =~ /\.gif$/i || $_->{page} =~ /\.jpg$/i || $_->{page} =~ /\.png$/i; next if $options{nobots} and $_->{browser} =~ /robot/i || $_->{browser} =~ /spider/i || $_->{browser} =~ /Googlebot/ || $_->{browser} =~ /Ask Jeeves/ || $_->{browser} =~ /WebCrawler/ || $_->{browser} =~ /ZyBorg/ || $_->{browser} =~ /Scooter/ || $_->{browser} =~ /Linkbot/ || $_->{browser} =~ /Slurp/ || $_->{browser} =~ /Gigabot/ || $_->{browser} =~ /Gulper/ || $_->{browser} =~ /SlySearch/ || $_->{browser} =~ /ia_archiver/ || $_->{browser} =~ /psbot/; # check custom Perl conditions $_->{print} = $_->{println} = $_->{end} = ""; for my $cond (@conditions) { my $result = eval $cond; die "bad condition: $cond\nerror: $@logline: $_->{logline}\n" if $@; next ACCESS unless $result; } if (length $_->{println}) { $_->{print} = $_->{println} . "\n" } if (length $_->{end}) { $end = $_->{end} } if ($options{quiet}) { print $_->{print}; next } # break off a new group if the IP address has changed from the last one if ($_->{ipname} ne $lastip) { $lastip = $_->{ipname}; $lastref = ""; # print IP address unless ($options{noip} or $options{noinfo}) { print "\n$_->{ipname}"; print " ($_->{ipaddr})" if $_->{ipaddr} ne $_->{ipname}; print ":\n"; } # print browser info unless ($options{nobsr} or $options{noinfo} or length $_->{browser} < 2) { print "bsr: " . wrap($_->{browser}, 5) . "\n"; } } # print referrer if it's interesting if (!$options{noref} and !$options{noinfo} and $_->{referrer} ne '-' and $_->{referrer} ne $lastref) { if (!$_->{user} or index($_->{referrer}, "/~$_->{user}") == -1) { $lastref = $_->{referrer}; (my $ref = $_->{referrer}) =~ s|^http://||; print "ref: " . wrap($ref, 5) . "\n" if length $_->{referrer} > 2; } } # print specific hit info print $_->{print}; print " " x (6 - length $_->{time}), "$_->{time}: ", wrap($_->{page}, 8), "\n" unless $options{murmur}; } # print last words and exit print "$end\n"; exit; # wraps and indents a line that would go off of an 80-character screen sub wrap { my ($string, $indent) = @_; if (length $string > 80-$indent) { $string = substr($string, 0, 79-$indent) . "\n" . " " x $indent . wrap(substr($string, 79-$indent), $indent); } return $string; } # uncodes embedded hex characters ("%7Ebret" to "~bret") sub uncode { my ($string) = @_; $string =~ s/%([0-9A-Fa-f]{2})/((oct("0x$1")>31) ? chr(oct("0x$1")):'')/eg; return $string; }