#!/usr/bin/perl
eval 'exec /usr/bin/perl -S $0 ${1+"$@"}'
    if $running_under_some_shell;
##
##  htmlstrip -- Strip HTML markup code
##  Copyright (c) 1997,1988 Ralf S. Engelschall, All Rights Reserved. 
##

require 5.003;

BEGIN { $^W = 0; } # get rid of nasty warnings

use lib "/usr/lib/wml/perl/lib";
use lib "/usr/lib/wml/perl/lib/m68k-linux/5.004";
use lib "/usr/local/lib/site_perl";
use lib "/usr/local/lib/site_perl/m68k-linux";

use Getopt::Long 2.13;
use IO::Handle 1.15;
use IO::File 1.06;

#
#   process command line
#
#   Print the command line synopsis to STDERR and terminate the
#   program with exit status 1.  Takes no arguments, never returns.
sub usage {
    print STDERR <<'EOT';
Usage: htmlstrip [options] [file]

Options:
  -o, --outputfile=<file>   set output file instead of stdout
  -O, --optimize=<level>    set optimization/crunch level
  -v, --verbose             verbose mode
EOT
    exit(1);
}
#   option defaults: not verbose, output to stdout ('-'), level 2
($opt_v, $opt_o, $opt_O) = (0, '-', 2);

#   Getopt::Long is configured through its package variables here:
#   single-letter options may be bundled, and the getopt_compat mode is
#   switched off.
$Getopt::Long::bundling      = 1;
$Getopt::Long::getopt_compat = 0;

#   No destinations are passed, so the parsed values end up in the
#   $opt_v, $opt_O and $opt_o globals.  Any parse error shows the usage
#   text, which also ends the program.
Getopt::Long::GetOptions("v|verbose", "O|optimize=i", "o|outputfile=s")
    or usage();

#   Write a progress message to STDERR, but only when verbose mode
#   ($opt_v) is on.  The message is printed as given after the fixed
#   prefix; no newline is appended.
sub verbose {
    my $str = shift;
    print STDERR "** HTMLstrip:Verbose: $str" if $opt_v;
}

#
#   read input file
#
if (($#ARGV == 0 and $ARGV[0] eq '-') or $#ARGV == -1) {
    #   no file argument, or a single "-": slurp standard input
    $in = IO::Handle->new;
    $in->fdopen(fileno(STDIN), 'r')
        or die "htmlstrip: cannot read from stdin: $!\n";
    local ($/) = undef;
    $INPUT = <$in>;
    $in->close;
}
elsif ($#ARGV == 0) {
    #   exactly one file argument: slurp that file.  The mode is passed
    #   separately so that no character of the file name can be taken
    #   as an open mode, and a failing open is reported instead of
    #   being treated as empty input.
    $in = IO::File->new;
    $in->open($ARGV[0], 'r')
        or die "htmlstrip: cannot open input file $ARGV[0]: $!\n";
    local ($/) = undef;
    $INPUT = <$in>;
    $in->close;
}
else {
    #   more than one argument
    &usage;
}

#   the stripping code below expects a defined string
$INPUT = '' unless defined $INPUT;

#
#   global initial stripping
#

#   strip sharp-like comments: every line whose first non-whitespace
#   character is "#" is removed up to, but not including, its newline.
#   Because \s also matches newlines, whitespace-only lines directly
#   above such a line are consumed by the same match.  This runs on the
#   complete input before the processing loop below, i.e. it also hits
#   lines inside <pre>, <xmp> and <nostrip> areas.
$INPUT =~ s|^\s*#.*$||mg;

#
#   stripping functions for particular areas
#

#   Strip Plain Text, i.e. text outside of any preformatted area and
#   outside of any HTML tag.
#
#   Takes the text chunk as its only argument and returns the stripped
#   copy.  The amount of stripping is controlled by the global $opt_O;
#   every level includes the lower ones:
#     1  remove empty (whitespace-only) lines
#     2  squeeze runs of blanks/tabs behind a word to a single blank
#        and remove whitespace in front of a newline
#     3  remove leading whitespace of every line
#     4  remove the empty lines which are still left
#     5  join all lines and re-wrap them to less than 80 characters
sub StripPlainText {
    my ($buf) = @_;

    #   Level 0
    #if ($opt_O >= 0) {
    #}
    #   Level 1
    if ($opt_O >= 1) {
        #   strip empty lines
        $buf =~ s|\n\s*\n|\n|sg;
    }
    #   Level 2
    if ($opt_O >= 2) {
        #   strip multiple whitespaces to single one
        $buf =~ s|(\S+)[ \t]{2,}|$1 |sg;
        #   strip trailing whitespaces
        $buf =~ s|\s+\n|\n|sg;
    }
    #   Level 3
    if ($opt_O >= 3) {
        #   strip leading whitespaces
        $buf =~ s|^\s+||mg;
    }
    #   Level 4
    if ($opt_O >= 4) {
        #   strip empty lines again
        $buf =~ s|^\s*$||mg;
        $buf =~ s|\n\n|\n|sg;
    }
    #   Level 5
    if ($opt_O >= 5) {
        #   concatenate all lines
        $buf =~ s|\n| |sg;

        #   re-wrap: collect tokens in $line as long as the line stays
        #   below 80 characters, else start a new line with the token
        my $from = $buf;
        my $line = '';
        my $token;
        $buf = '';
        while (length($from) > 0) {
            ($token, $from) = nexttoken($from);
            if ((length($line) + length($token)) < 80)  {
                $line .= $token;
            }
            else {
                $buf .= $line . "\n";
                $line = $token;
            }
        }
        #   flush the line still being collected; without this the end
        #   of the text (or all of a short text) would be thrown away
        $buf .= $line;

        #   strip the whitespace left over at the line breaks
        $buf =~ s|^\s+||mg;
        $buf =~ s|\s+$||mg;
    }

    return $buf;
}

#   Split the next token off the front of a text buffer (helper for
#   level 5 of StripPlainText).  A token is the text in front of the
#   next "<", else a complete "<...>" tag at the start of the buffer,
#   else the whole buffer.  A token longer than 80 characters is cut
#   at a blank.  Returns the list (token, remaining buffer); the token
#   is never empty for a non-empty buffer, so callers always advance.
sub nexttoken {
    my ($buf) = @_;
    my ($token, $bufN);

    if ($buf =~ m|^([^<]+?)(<.+)$|s) {
        $token = $1;
        $bufN  = $2;
    }
    elsif ($buf =~ m|^(<[^>]+>)(.*)$|s) {
        $token = $1;
        $bufN  = $2;
    }
    else {
        $token = $buf;
        $bufN  = '';
    }

    if (length($token) > 80) {
        #   preferred cut: the last blank within the first 80 characters
        my $cut = rindex(substr($token, 0, 80), ' ');
        if ($cut <= 0) {
            #   none there, or only the leading one which would give an
            #   empty token: take the first later blank instead
            $cut = index($token, ' ', 1);
        }
        #   without any usable blank the over-long token stays in one piece
        if ($cut > 0) {
            $bufN  = substr($token, $cut) . $bufN;
            $token = substr($token, 0, $cut);
        }
    }
    return ($token, $bufN);
}

#   Strip HTML Tag, i.e. text outside of any preformatted area but
#   inside a HTML tag.
#
#   Takes the tag text and returns the stripped copy.  Only the levels
#   2 and 4 of the global $opt_O add work here; the other levels do
#   nothing of their own.
sub StripHTMLTag {
    my ($buf) = @_;

    #   Levels 0 and 1: tags pass through untouched
    return $buf if $opt_O < 2;

    #   Level 2 (also all there is for level 3)
    for ($buf) {
        #   strip multiple whitespaces to single one
        s|(\S+)[ \t]{2,}|$1 |mg;
        #   strip trailing whitespaces at end of line
        s|\s+\n|\n|sg;
        #   strip whitespaces between attribute name and value
        s|([ \t]+[a-zA-Z][a-zA-Z0-9_]*)\s*=\s*|$1=|sg;
        #   strip whitespaces before tag end
        s|[ \t]+>$|>|sg;
    }
    return $buf if $opt_O < 4;

    #   Level 4 (also all there is for level 5)
    for ($buf) {
        #   strip HTML comments
        s|<!--.+?-->||sg;
        #   strip newlines before tag end
        s|\n>$|>|sg;
    }
    return $buf;
}

#   Strip Preformatted Areas, i.e. text inside <pre>, <xmp> and
#   <nostrip> container tags.
#
#   Takes the text and returns the stripped copy.  The layout is kept:
#   the only thing removed, from level 2 of the global $opt_O on, are
#   blanks and tabs between the last non-whitespace character of a
#   line and its newline.  Whitespace-only lines are left as they are.
sub StripPreformatted {
    my ($text) = @_;

    #   strip trailing whitespaces on non-empty lines (level 2 and up)
    $text =~ s|([^\s]+)[ \t]+\n|$1\n|sg if $opt_O >= 2;

    return $text;
}

#
#   Processing Loop
#
#   The container tags which protect their content, keyed by name:
#     BEGIN, END  the opening and the closing tag
#     REMOVE      true if the two tags themselves are dropped from the
#                 output (the content is kept either way)
%TAGS = ();
$TAGS{"nostrip"} = { BEGIN => "<nostrip>", END => "</nostrip>", REMOVE => 1 };
$TAGS{"pre"}     = { BEGIN => "<pre>",     END => "</pre>",     REMOVE => 0 };
$TAGS{"xmp"}     = { BEGIN => "<xmp>",     END => "</xmp>",     REMOVE => 0 };

#   the stripped result is accumulated here
$OUTPUT = '';

#   Strip a chunk of markup which lies outside of all preformatted
#   areas.  The chunk is cut into plain text and "<...>" tags; plain
#   text goes through StripPlainText, tags go through StripHTMLTag,
#   and the results are joined again in the original order.
#   Takes the chunk, returns the stripped chunk.
sub StripNonPreformatted {
    my ($I) = @_;

    #   One pass over the chunk instead of re-matching (and copying)
    #   the remaining text once per tag.  Because the separator is
    #   captured, the list alternates text, tag, text, ..., text; the
    #   limit of -1 keeps an empty text field after a final tag.
    my @parts = split(m|(<.+?>)|s, $I, -1);
    #   split gives no field at all for an empty chunk, but the text
    #   handler is still to be called once for it
    @parts = ($I) unless @parts;

    my $O = '';
    my $n;
    for ($n = 0; $n <= $#parts; $n++) {
        #   even positions are plain text, odd positions are tags
        $O .= ($n % 2) ? &StripHTMLTag($parts[$n])
                       : &StripPlainText($parts[$n]);
    }
    return $O;
}

#   Append a stripped non-preformatted chunk to $OUTPUT.  If $OUTPUT
#   already ends with a newline and the chunk starts with one, the
#   first newline of the chunk is dropped.
my $emit = sub {
    my ($chunk) = @_;
    if (substr($OUTPUT, length($OUTPUT)-1, 1) eq "\n" and
        substr($chunk, 0, 1) eq "\n") {
        $chunk = substr($chunk, 1, length($chunk)-1);
    }
    $OUTPUT .= $chunk;
};

#   Eat up $INPUT from the front: everything before the next protected
#   area is stripped as ordinary markup, the area itself is stripped
#   as preformatted text, and so on until no begin tag is left.
while (1) {
    #   find the begin tag which comes first in the remaining input
    my $total  = length($INPUT);
    my $offset = $total;
    my ($before, $open, $after, $name);
    foreach $tag (keys(%TAGS)) {
        next unless $INPUT =~ m|^(.*?)($TAGS{$tag}->{BEGIN})(.*)$|is;
        next unless length($1) < $offset;
        ($offset, $before, $open, $after, $name) = (length($1), $1, $2, $3, $tag);
    }

    #   no begin tag anymore: the rest is ordinary markup, then done
    if ($offset >= $total) {
        &$emit(&StripNonPreformatted($INPUT));
        last;
    }

    #   the markup in front of the area, then the begin tag itself
    #   unless this kind of tag is to be removed
    &$emit(&StripNonPreformatted($before));
    my $keep = not $TAGS{$name}->{REMOVE};
    $OUTPUT .= $open if $keep;
    $INPUT = $after;

    #   the area up to its end tag; if there is no end tag, the text
    #   stays in $INPUT and is handled by the next round
    if ($INPUT =~ m|^(.*?)($TAGS{$name}->{END})(.*)$|is) {
        my ($body, $close, $rest) = ($1, $2, $3);
        $OUTPUT .= &StripPreformatted($body);
        $OUTPUT .= $close if $keep;
        $INPUT = $rest;
    }
}

#
#   global final stripping
#
#   remove every <suck> marker (in any letter case) together with all
#   whitespace around it; this is applied to the complete result
$OUTPUT =~ s|\s*<suck>\s*||isg;
#   remove a single newline at the very start of the result
$OUTPUT =~ s|^\n||s;

#
#   write to output file
#
if ($opt_o eq '-') {
    $out = IO::Handle->new;
    $out->fdopen(fileno(STDOUT), "w")
        or die "htmlstrip: cannot write to stdout: $!\n";
}
else {
    #   The mode is passed separately so that no character of the file
    #   name can be taken as part of the open mode.
    $out = IO::File->new;
    $out->open($opt_o, "w")
        or die "htmlstrip: cannot open output file $opt_o: $!\n";
}
#   a failed write must not end with exit status 0; buffered data may
#   only fail when the handle is closed, so that is checked as well
$out->print($OUTPUT)
    or die "htmlstrip: cannot write output: $!\n";
$out->close
    or die "htmlstrip: cannot close output: $!\n";

exit(0);

##EOF##
