#! /usr/bin/perl -w

# vim:syntax=perl

use strict;

use lib '/usr/share/perl5';

use Lire::Program qw( :msg $PROG );
use Lire::DlfSchema;
use Lire::DataTypes qw( :time );
use Getopt::Long;
use POSIX qw( strftime );


# Command-line handling: the reporting period defaults to one day and
# can be overridden with --period.
my %opts = ( period => "1d" );

GetOptions( \%opts, "period=s" )
  or lr_err( "Usage: $PROG [--period period] <superservice>" );

lr_err( "$PROG: invalid period: $opts{period}" )
  unless check_duration( $opts{period} );

my $superservice = shift
  or lr_err( "Usage: $PROG [--period period] <superservice>" );

# Load the DLF schema of the superservice; any exception raised while
# loading is reported through lr_err().
my $schema = eval {Lire::DlfSchema::load_schema( $superservice );};
lr_err( "$PROG: error loading schema : $@" ) if $@;

my $time_idx        = $schema->timestamp_field->pos();
my @fields          = @{ $schema->fields };

# Positions of the fields whose type makes them candidate grouping keys.
my @group_fields = map { $_->pos } grep {
                     $_->type =~ /^(string|ip|hostname|url|email|filename)$/;
                   } @{$schema->fields};

# Both arrays are indexed by field position.  Each used slot is a hash
# holding the counters lr_keys (number of keys) and lr_key_length
# (summed length of those keys).  @field_stats covers the current
# period only; @global_stats accumulates over the whole input.
my @field_stats     = ();
my @global_stats    = ();

my $dlf_count = 0;
my $period = duration2sec( $opts{period} );
my $period_start = 0;           # timestamp of the first record of the period
my $period_record_start = 0;    # records read before the period began
my $dlf_time_start = 0;         # timestamp of the very first period start
my $time;
while (<>) {
    chomp;
    $dlf_count++;

    lr_info( "processing ${dlf_count}th records..." )
      unless $dlf_count % 5000;

    my @dlf = split /\s+/;

    $time = $dlf[$time_idx];
    if ( $period_start + $period < $time) {
        # The current record opens a new period: it is already included
        # in $dlf_count but does not belong to the period being closed,
        # hence the "- 1".  Nothing is printed for the very first record
        # since no period has been started yet.
        print_stats( $period_start, $time,
                     $dlf_count - 1 - $period_record_start,
                     \@field_stats )
          if ( $period_start );

        # Start the new period with fresh per-period statistics.
        $period_start = $time;
        $dlf_time_start = $time unless $dlf_time_start;
        # Record how many records precede this period, so that
        # "$dlf_count - $period_record_start" always counts the current
        # record as part of it (the final report relies on this).
        $period_record_start = $dlf_count - 1;
        @field_stats = ();
    }

    foreach my $field_idx ( @group_fields ) {
        my $key     = $dlf[$field_idx];
        my $key_len = length $key;
        # Seen keys live in their own nested hash so that a field value
        # which happens to be "lr_keys" or "lr_key_length" cannot be
        # mistaken for one of the counters.
        my $stats   = $field_stats[$field_idx]  ||= {
                                                     lr_keys        => 0,
                                                     lr_key_length  => 0,
                                                     lr_seen        => {},
                                                    };
        unless (exists $stats->{lr_seen}{$key}) {
            $stats->{lr_seen}{$key} = 1;
            $stats->{lr_keys}++;
            $stats->{lr_key_length} += $key_len;
            # The global counters are bumped whenever a key is new
            # within the current period, so a key that shows up in
            # several periods is counted once per period in the summary.
            $global_stats[$field_idx]{lr_keys}++;
            $global_stats[$field_idx]{lr_key_length} += $key_len;
        }
    }
}

if ( $dlf_count ) {
    # Report the period that was still open when the input ended.
    print_stats( $period_start, $time, $dlf_count - $period_record_start,
                 \@field_stats );

    print "=" x 15, "\n";

    print "Sumarry\n";
    print_stats( $dlf_time_start, $time, $dlf_count, \@global_stats );
}

lr_info( "processed $dlf_count records" );

exit 0;

# print_stats( $the_start, $the_end, $count, $the_stats )
#
# Print a report on standard output for one span of records.
#
#   $the_start, $the_end  epoch timestamps, printed as locale dates (%x)
#   $count                number of DLF records in the span
#   $the_stats            array reference indexed by field position; each
#                         used slot is a hash with the counters lr_keys
#                         and lr_key_length
#
# One line is printed for every field listed in @group_fields.  A field
# without statistics, or a span without records, is reported with zero
# values instead of aborting on a division by zero.
sub print_stats {
    my ( $the_start, $the_end, $count, $the_stats ) = @_;

    my $start   = strftime '%x', localtime $the_start;
    my $end     = strftime '%x', localtime $the_end;

    print "Period: $start - $end\n";
    print "DLF records: $count\n";
    printf "%-20s %-12s %-8s %-10s %-10s\n", "Field", "Type", 'Key %', "Key Size", "Avg Key Length";
    foreach my $field_idx ( @group_fields ) {
        my $stats       = $the_stats->[$field_idx] || {};
        my $keys        = $stats->{lr_keys} || 0;
        my $key_size    = $stats->{lr_key_length} || 0;

        # Percentage of records that introduced a new key, and average
        # length of those keys; both guarded against a zero divisor.
        my $key_ratio   = $count ? $keys * 100 / $count : 0;
        my $avg_len     = $keys  ? $key_size / $keys    : 0;

        # Show large sizes with a "k" or "m" suffix; small ones are
        # printed as a plain number.
        if ($key_size > 1024*1024 ) {
            $key_size = sprintf '%.2fm', $key_size / (1024*1024);
        } elsif ($key_size > 1024 ) {
            $key_size = sprintf '%.2fk', $key_size / 1024;
        }

        printf "%-20s %-12s %5.2f %% %10s %10.2f\n", $fields[$field_idx]->name,
          $fields[$field_idx]->type, $key_ratio, $key_size, $avg_len;
    }
    print "\n";
}

# vim:syntax=perl
# Local Variables:
# mode: cperl
# End:

__END__

=pod

=head1 NAME

lr_dlf_analyze - analyze the key distribution and key length of a dlf file

=head1 SYNOPSIS 

B<lr_dlf_analyze> B<[--period> I<p>B<]> I<superservice>

=head1 DESCRIPTION

B<lr_dlf_analyze> is a tool for Lire developers.  It analyzes the key
distribution and key length of the dlf inputs to have a better idea of what
is going on.

The tool works by analyzing a DLF file for fields that are likely to
be used as keys in group elements. (It doesn't make sense to analyze
the distribution of int, bytes or time fields since they can't be
used as discrete keys.)

It computes the number of different keys for a field in a given period,
the average key length, and the total memory used by the keys. It outputs
a summary after each period and a summary at the end.

The period I<p> can be a string like 1w, 2d, 36h, etc.

=head1 EXAMPLE

Running on a 500,000 records dlf:

 [francis@Arendt tests]$ lr_dlf_analyze -p 16w www <combined.dlf > stats
 all all UNSET lr_dlf_analyze info started with -p 16w www
 all all UNSET lr_dlf_analyze info processing 5000th records...
 all all UNSET lr_dlf_analyze info processing 10000th records...
 ...
 all all UNSET lr_dlf_analyze info processing 565000th records...
 all all UNSET lr_dlf_analyze info processed 568688 records
 all all UNSET lr_dlf_analyze info memory stats: vsize=7644K rss=6232K majflt=407
 all all UNSET lr_dlf_analyze info elapsed time in seconds real=87 user=71.29 system=1.25
 all all UNSET lr_dlf_analyze info stopped
 
 [francis@Arendt tests] cat stats
 Period: 2000-11-01 - 2001-02-21
 DLF records: 69558
 Field                Type         Key %    Key Size   Avg Key Length
 client_host          hostname      1.12 %     13.61k      17.97
 client_domain        hostname      0.78 %      5.27k       9.89
 who                  string        0.00 %          1       1.00
 http_action          string        0.01 %         24       4.00
 requested_page       url           4.28 %     97.17k      33.39
 requested_page_ext   string        0.05 %        164       4.82
 requested_file       filename      2.04 %     47.34k      34.24
 http_protocol        string        0.00 %         17       5.67
 referer              string        1.66 %     61.93k      54.76
 useragent            string        1.02 %     20.31k      29.41
 gzip_result          string        0.00 %         13      13.00
 
 Period: 2001-02-21 - 2001-06-13
 DLF records: 239613
 Field                Type         Key %    Key Size   Avg Key Length
 client_host          hostname      1.60 %     78.47k      20.92
 client_domain        hostname      0.95 %     22.05k       9.91
 who                  string        0.00 %          1       1.00
 http_action          string        0.00 %         20       4.00
 requested_page       url           2.79 %    239.61k      36.74
 requested_page_ext   string        0.02 %        497       8.42
 requested_file       filename      1.35 %    121.63k      38.37
 http_protocol        string        0.00 %         22       5.50
 referer              string        1.35 %    199.91k      63.10
 useragent            string        0.47 %     54.71k      50.20
 gzip_result          string        0.00 %         13      13.00
 
 Period: 2001-06-13 - 2001-09-02
 DLF records: 259516
 Field                Type         Key %    Key Size   Avg Key Length
 client_host          hostname      2.35 %    121.29k      20.38
 client_domain        hostname      1.45 %     36.96k      10.06
 who                  string        0.00 %          1       1.00
 http_action          string        0.00 %      1.07k     109.80
 requested_page       url           1.85 %    185.11k      39.38
 requested_page_ext   string        0.06 %      2.65k      17.52
 requested_file       filename      0.93 %     93.90k      39.77
 http_protocol        string        0.00 %         24       6.00
 referer              string        1.13 %    187.05k      65.57
 useragent            string        0.52 %     66.51k      50.79
 gzip_result          string        0.00 %         13      13.00
 
 ===============
 Sumarry
 Period: 2000-11-01 - 2001-09-02
 DLF records: 568688
 Field                Type         Key %    Key Size   Avg Key Length
 client_host          hostname      1.88 %    213.38k      20.40
 client_domain        hostname      1.16 %     64.28k      10.00
 who                  string        0.00 %          3       1.00
 http_action          string        0.00 %      1.12k      54.38
 requested_page       url           2.54 %    521.88k      36.93
 requested_page_ext   string        0.04 %      3.30k      13.61
 requested_file       filename      1.24 %    262.87k      38.02
 http_protocol        string        0.00 %         63       5.73
 referer              string        1.29 %    448.89k      62.77
 useragent            string        0.56 %    141.53k      45.80
 gzip_result          string        0.00 %         39      13.00


=head1 SEE ALSO

This post on the LogReport development list:
http://www.nlnet.nl/projects/logreport/hypermail/logreport/development/0335.html

=head1 VERSION

$Id: lr_dlf_analyze.in,v 1.3 2001/10/18 19:13:17 flacoste Exp $

=head1 COPYRIGHT

Copyright (C) 2001 Stichting LogReport Foundation LogReport@LogReport.org
 
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
 
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
 
You should have received a copy of the GNU General Public License
along with this program (see COPYING); if not, check with
http://www.gnu.org/copyleft/gpl.html or write to the Free Software 
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.

=head1 AUTHOR

Francis J. Lacoste <flacoste@logreport.org>

=cut

