#! /usr/bin/perl -w 
#
# lgrep v1.0:  human-readable view of Apache webserver logs
#              Bret Victor, bret@ugcs.caltech.edu
#              http://www.ugcs.caltech.edu/~bret
#
# lgrep WORD [ CONDITIONS ]
#   searches today's server logs and returns the hits that contain WORD and 
#   match the conditions.  Normally, the hits are shown chronologically,
#   and are grouped by IP address.  This works well for sparse, non-
#   overlapping user sessions or when you don't care much about the grouping.
#   To track multiple overlapping user sessions, see "groupip" below.
#
#   A "condition" can be one of these condition words:
#
#       nopics      excludes all hits on image files  
#       nobots      excludes all hits by webbots
#       nome        excludes all hits from your own ip address
#       good        excludes all non-2xx (HTTP OK) hits
#       bad         excludes all 2xx (HTTP OK) hits
#       user        excludes all hits which are not in WORD's user directory.
#                   This is useful if your username happens to be a common
#                   substring in other people's files.
#
#   or it can be one of these (non-conditional) option commands:
#
#       ipname      tries to reverse-DNS the IP addresses (can be slow)
#       nosort      doesn't bother to sort the hits chronologically
#       groupip     reorders the hits to group together all accesses from 
#                   the same IP address.  The groups are ordered by the 
#                   time of their first hit (unless "nosort" is given)
#       nobsr       supresses printing of the browser identification string
#       noref       supresses printing of the referrer URL
#       noip        supresses printing of the IP address
#       noinfo      same as all three above commands together; just prints
#                      specific hit data, with no group info
#       murmur      supresses output for specific hits, but still prints
#                      IP, referrer, and browser info for groups
#       quiet       supresses all normal output
#       debug       complains about malformed log lines instead of just
#                      skipping them
#
#   or it can be a Perl expression involving these variables:
#
#       $ipaddr     ip address string ("xxx.xxx.xxx.xxx")
#       $ipname     hostname (will be equal to $ipaddr if "ipname" command
#                             is not given, or if reverse-DNS fails)
#       $request    HTTP request string
#       $page       requested pathname, abbreviated
#       $user       username whose directory the page is in, or "" if none
#       $browser    web browser identification string
#       $referrer   referring page URL
#       $search     search string if referred by search engine, or "" if not
#       $response   HTTP response code number
#       $seconds    time, in seconds since epoch
#       $date       date string ("mm/dd/yy")
#       $time       time string ("hh/mm", 24-hour time)
#       $logline    complete line in server log
#       $print      assign to this variable to print output before the 
#                      hit info is printed (but after group info)
#       $println    same as $print, but adds a newline
#       $end        assign to this variable to print output right before the
#                      program exits
#
#
#   Examples:
#
#       # this is how I normally look at my traffic
#       lgrep bret nopics nobots ipname groupip
#
#       # show only form submissions
#       lgrep bret '$request =~ /^POST/'
#
#       # show only Mozilla users and don't show image files or bad hits
#       lgrep bret good nopics '$browser =~ /mozilla/i'
#
#       # show only bad hits, and prefix each hit with the response code
#       lgrep bret bad '$print = "  $response: "'
#
#       # show all server activity at noon today
#       lgrep - '$time eq "12:00"'
#
#       # design your own custom output format
#       lgrep bret ipname quiet '$println = "$ipname ($ipaddr):  $date $time"'
#
#       # just print a count of unique human visitors
#       lgrep bret user nobots quiet '${z}{$ipaddr}++; $end = keys %z'
#
#       # see what sorts of perverted things people like to websearch
#       lgrep sex quiet '$println = $search'
#
#
#   Notes:
#
#     If you are not running this on UGCS, change the
#       my @logs = ...
#     line to a list of the pathnames to your actual server logs, or
#     pipe your server logs in through STDIN.
#
#     If you hook this up to a CGI script (or otherwise run it setuid),
#     be very VERY careful, since Perl conditions are eval'd directly.
#     Consider using the Safe module, or disallowing custom conditions
#     entirely.
#
#     This program was written quickly and sorta off the top of my head, 
#     so don't expect it to be bulletproof or even entirely correct.
#     Suggestions are welcome.
#


use Socket;
use Time::Local;


# set up useful variables

my $myname = `whoami` || $ENV{USER} || "";  chomp $myname;
my ($myip) = split ' ', $ENV{SSH_CLIENT} || "0.0.0.0";
my %months = (jan=>0, feb=>1, mar=>2, apr=>3,  may=>4, jun=>5, jul=>6,
              aug=>7, sep=>8, oct=>9, nov=>10, dec=>11);
my %ipcache;


# parse command line options

my $word = shift || die "usage: lgrep WORD [ CONDITIONS ]\n";
my %options;
my @conditions = grep { s/\$(\w+)/\$_->{$1}/g or /\W/ or 
                                              $options{$_}++ && 0 } @ARGV;


# read raw data from server logs, or from STDIN if it's pipe-like

my @logs = map "/ug/adm/log/www/$_/access_log",
               `ypmatch linuxclient netgroup` =~ /\w+/g  if -t STDIN;
my @lines = `grep -h $word @logs`;


# parse log data and convert to a list of hashes
#   (The nasty "eval" business is because there is no way to short-circuit
#    a map block.)

my @accesses = map {  my $hash = eval { 

    # parse log line

    my ($date, $ipaddr, $response, $request, $referrer, $browser) =
    /^
      \[ (.*?) \] \s     # date
      .+? \s             # ??? 1590
      .+? \s             # ??? 1
      (\S+) \s           # ip address
      (\d+) .+? \s       # HTTP response code
      .+? \s             # ??? 84539
      .+? \s             # ??? -
      " (.*?) " \s       # HTTP request (webpage)
      " (.*?) " \s       # referrer URL
      " (.*?) "          # browser name
    /x;
    die "What's up with this log line?\n$_\n" unless $ipaddr;


    # do the DNS or reverse-DNS, if desired

    my $ipname;
    if (!$options{ipname}) {
        $ipname = $ipaddr;
    } elsif ($ipaddr !~ /[^\d.]/) {
        $ipname = $ipcache{$ipaddr} ||=
            gethostbyaddr(inet_aton($ipaddr), AF_INET) || $ipaddr;
    } else {
        $ipname = $ipaddr;
        my $packedip = $ipcache{$ipname} ||= gethostbyname($ipname);
        $ipaddr = $packedip ? inet_ntoa($packedip) : "0.0.0.0";
    }


    # parse the date string

    my ($mday, $mon, $year, $hour, $min, $sec) =
        ($date =~ m| (\d+) / (\w+) / (\d+) : (\d+) : (\d+) : (\d+) |x);
    die "What's up with the date in this log line?\n$_\n" unless $mday;
    $mon = $months{lc $mon};
    my $seconds = timelocal($sec,$min,$hour,$mday,$mon,$year-1900);
    my $time = sprintf "%d:%02d", $hour, $min;
    $date = sprintf "%d/%d/%02d", $mon+1, $mday, $year%100;


    # abbreviate the target URL and figure out the target user

    my $page = uncode($request);
    my ($user) = ($page =~ m|^\w+ /~([\w\d]+)|);
    $user ||= "";
    $page =~ s|^\w+ /||  and  $page =~ s|^~$myname|~|o;
    $page =~ s| HTTP.*?$||;


    # figure out the search engine query string, if any

    $referrer = "" if length $referrer < 2;
    my $search = "";
    my (undef, $query) = split /\?/, $referrer;
    if ($query) {
        my $key;
        for ($referrer) {
            $key = /yahoo/         && 'p'   ||
                   /northernlight/ && 'qr'  ||
                   /webcrawler/    && 'qkw' ||
                   /askjeeves/     && 'ask' ||
                   /aj\.com/       && 'ask' ||
                                      'q';
        }
        if ($query =~ / (?:^|&) $key= ([^&]+) /x) {
            $search = uncode($1);
            $search =~ tr/+/ /;
            if ($search =~ /^cache:/) { $search = "" }
        }
    }
    $referrer = uncode($referrer);


    # construct the hash to represent this hit

    { ipaddr => $ipaddr,
      ipname => $ipname,
      request => $request,
      page => $page,
      user => $user,
      browser => $browser,
      referrer => $referrer,
      search => $search,
      response => $response,
      seconds => $seconds,
      date => $date,
      time => $time,
      logline => $_
    }

};  $@ ? ( $options{debug} ? die $@ : () ) : $hash;  }  @lines;


# sort list chronologically by time and then alphabetically by URL

unless ($options{nosort}) {
    @accesses = sort { $a->{seconds} <=> $b->{seconds} or
                       $a->{page}    cmp $b->{page}        } @accesses;
}


# group list by IP address, if desired

if ($options{groupip}) {
    my %seenip;
    @accesses = map {
        my $ipaddr = $_->{ipaddr};
        $seenip{$ipaddr}++ ? () : grep { $_->{ipaddr} eq $ipaddr } @accesses;
    } @accesses;
}


# go through list, evaluating conditions and printing output

my ($lastip, $lastref, $end) = ("","","");
ACCESS: foreach (@accesses) {

    # check preset condition words

    next if $options{good} and   $_->{response} !~ /2/;
    next if $options{bad} and    $_->{response} =~ /2/;
    next if $options{nome} and   $_->{ipaddr} eq $myip;
    next if $options{user} and   !$_->{user} || $_->{user} ne $word;
    next if $options{nopics} and $_->{page} =~ /\.gif$/i || 
                                 $_->{page} =~ /\.jpg$/i ||
                                 $_->{page} =~ /\.png$/i;
    next if $options{nobots} and $_->{browser} =~ /robot/i ||
                                 $_->{browser} =~ /spider/i ||
                                 $_->{browser} =~ /Googlebot/ ||
                                 $_->{browser} =~ /Ask Jeeves/ ||
                                 $_->{browser} =~ /WebCrawler/ ||
                                 $_->{browser} =~ /ZyBorg/ ||
                                 $_->{browser} =~ /Scooter/ ||
                                 $_->{browser} =~ /Linkbot/ ||
                                 $_->{browser} =~ /Slurp/ ||
                                 $_->{browser} =~ /Gigabot/ ||
                                 $_->{browser} =~ /Gulper/ ||
                                 $_->{browser} =~ /SlySearch/ ||
                                 $_->{browser} =~ /ia_archiver/ ||
                                 $_->{browser} =~ /psbot/;

    # check custom Perl conditions

    $_->{print} = $_->{println} = $_->{end} = "";
    for my $cond (@conditions) {
        my $result = eval $cond;
        die "bad condition:  $cond\nerror: $@logline: $_->{logline}\n" if $@;
        next ACCESS unless $result;
    }
    if (length $_->{println}) { $_->{print} = $_->{println} . "\n" }
    if (length $_->{end}) { $end = $_->{end} }

    if ($options{quiet}) { print $_->{print};  next }


    # break off a new group if the IP address has changed from the last one

    if ($_->{ipname} ne $lastip) {
        $lastip = $_->{ipname};
        $lastref = "";

        # print IP address

        unless ($options{noip} or $options{noinfo}) {
            print "\n$_->{ipname}";
            print " ($_->{ipaddr})" if $_->{ipaddr} ne $_->{ipname};
            print ":\n";
        }

        # print browser info

        unless ($options{nobsr} or $options{noinfo} or 
                length $_->{browser} < 2) {
            print "bsr: " . wrap($_->{browser}, 5) . "\n";
        }
    }

    # print referrer if it's interesting

    if (!$options{noref} and !$options{noinfo} and $_->{referrer} ne '-' and 
                                                  $_->{referrer} ne $lastref) {
        if (!$_->{user} or index($_->{referrer}, "/~$_->{user}") == -1) {
            $lastref = $_->{referrer};
            (my $ref = $_->{referrer}) =~ s|^http://||;
            print "ref: " . wrap($ref, 5) . "\n" if length $_->{referrer} > 2;
        }
    }


    # print specific hit info

    print $_->{print};
    print " " x (6 - length $_->{time}), "$_->{time}: ", 
          wrap($_->{page}, 8), "\n" unless $options{murmur};
}


# print last words and exit

print "$end\n";
exit;



# wraps and indents a line that would go off of an 80-character screen

sub wrap {
    my ($string, $indent) = @_;
    if (length $string > 80-$indent) {
        $string = substr($string, 0, 79-$indent)  .  "\n" .
                  " " x $indent  .  wrap(substr($string, 79-$indent), $indent);
    }
    return $string;
}

# uncodes embedded hex characters ("%7Ebret" to "~bret")

sub uncode {
    my ($string) = @_;
    $string =~ s/%([0-9A-Fa-f]{2})/((oct("0x$1")>31) ? chr(oct("0x$1")):'')/eg;
    return $string;
}

