#!/usr/bin/perl -w
# gnuhtml2latex html to latex converter
# Copyright (c) 1999 Tomasz Wegrzanowski <maniek@beer.com>
# Maintenance taken over by Gunnar Wolf, 2005
# Copyright (c) 2005-2010 Gunnar Wolf <gwolf@gwolf.org>
#
# gnuhtml2latex is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# On Debian GNU/Linux systems, the complete text of the GNU General
# Public License can be found in `/usr/share/common-licenses/GPL'.
#
# THIS IS VERY ALPHA

use strict;
use Getopt::Std;

# Parse the command line into %main::opts (see the POD at the end of the
# file for the meaning of each switch).
getopts( 'a:bcf:gh:i:mno:pst:HPS:', \%main::opts );

# -o (document class), -h (header text) and -f (footer text) are printed
# unconditionally later on, so make sure each of them holds a string.
for my $default ( [ o => '{article}' ], [ h => '' ], [ f => '' ] ) {
    my ( $opt, $value ) = @{$default};
    $main::opts{$opt} = $value unless defined $main::opts{$opt};
}

# Suffix appended to sectioning commands: `*' (unnumbered) unless -n is set.
$main::num = $main::opts{n} ? '' : '*';

{
# LaTeX text printed when an HTML element opens, keyed by tag name.
# Tags missing from this table produce no output of their own.
my %tagstable_start = (
    # Inline font changes; each one opens a group that %tagstable_end closes.
    b      => '\\textbf{',
    strong => '\\textbf{',
    i      => '\\textit{',
    var    => '\\textit{',
    em     => '\\emph{',
    u      => '\\underline{',
    tt     => '\\texttt{',
    samp   => '\\texttt{',
    code   => '\\texttt{',
    kbd    => '{\\tt\\bf ',
    dfn    => '{\\bf\\it ',
    cite   => '{\\sc ',

    # Paragraph and line breaks.
    p  => '\\par ',
    br => '\\\\',

    # Headings h1..h6; $main::num is `*' for unnumbered sections, else ''.
    (
        map { ( "h$_->[0]" => '\\' . $_->[1] . $main::num . '{' ) }
            [ 1 => 'section' ],
            [ 2 => 'subsection' ],
            [ 3 => 'subsubsection' ],
            [ 4 => 'paragraph' ],
            [ 5 => 'subparagraph' ],
            [ 6 => 'subparagraph' ]
    ),

    # Lists.  <dt> opens the optional argument of \item and <dd> closes it.
    ul => '\\begin{itemize}',
    ol => '\\begin{enumerate}',
    dl => '\\begin{description}',
    li => '\\item ',
    dt => '\\item[',
    dd => ']',

    # Block environments.
    pre        => '\\begin{verbatim}',
    listing    => '\\begin{verbatim}',
    blockquote => '\\begin{quotation}',
);

# LaTeX text printed when an HTML element closes, keyed by tag name.
my %tagstable_end = (
    # Everything that was opened with a `{' group is closed by a single `}'.
    (
        map { ( $_ => '}' ) }
            qw(b i u em h1 h2 h3 h4 h5 h6 tt kbd var dfn cite samp strong code)
    ),

    # Environments are closed by the matching \end.
    ul         => '\\end{itemize}',
    ol         => '\\end{enumerate}',
    dl         => '\\end{description}',
    listing    => '\\end{verbatim}',
    pre        => '\\end{verbatim}',
    blockquote => '\\end{quotation}',
);

# With -m, <sub> and <sup> are rendered by briefly entering math mode.
# Without it the two tags stay out of the tables and are silently dropped.
if ( $main::opts{m} ) {
    my %math_opener = (
        'sub' => '$_{\textrm{',
        'sup' => '$^{\textrm{',
    );
    for my $tag ( keys %math_opener ) {
        $tagstable_start{$tag} = $math_opener{$tag};
        $tagstable_end{$tag}   = '}}$';
    }
}

# Parser state.  These lexicals are shared (closed over) by all the
# HTML::LatexMaker subs defined in the rest of this block.

# Current document section, as set by start_mode()/end_mode().  The tag
# handlers in start() set it to 1 on <html>, 2 on <head> and 3 on <body>;
# text and tag translations are only printed while it is 3.
my $mode = 0;
# True when the file being parsed is the first one; start_mode() only
# prints the preamble / title material when this is set.
my $firstfile = 1;
# True when the file being parsed is the last one; end_mode() only prints
# the footer and \end{document} when this is set.
my $lastfile = 1;
# When true, text() escapes LaTeX special characters.  start() clears it
# on <pre>/<listing> and end() sets it again when they close.
my $substitution = 1;
# From here to the end of the enclosing block we are in the parser subclass.
package HTML::LatexMaker;
use HTML::Parser;
use HTML::Entities;
# Inherit from HTML::Parser; start(), end() and text() below are the
# callbacks it invokes.
@HTML::LatexMaker::ISA = ( "HTML::Parser" );
1;

# Setter for the shared $firstfile flag: pass a true value when the next
# file to be parsed is the first one of the run.  Returns the value stored.
sub firstfile {
    my ( $self, $flag ) = @_;
    return $firstfile = $flag;
}
# Setter for the shared $lastfile flag: pass a true value when the next
# file to be parsed is the last one of the run.  Returns the value stored.
sub lastfile {
    my ( $self, $flag ) = @_;
    return $lastfile = $flag;
}

# Print the opening part of a hyperref link, `\href{URL}{', for an <a> tag.
#
#   $attr   - hash ref with the attributes of the tag
#   $attseq - array ref with the attribute names in source order (unused)
#
# Nothing is printed unless -H was given and the tag has an href
# attribute.  The closing brace is printed when the element ends.
sub anchor_convert {
    my ( $attr, $attseq ) = @_;

    my $linkable = defined( $main::opts{H} ) && defined( $attr->{href} );
    return unless $linkable;

    return printf "\\href{%s}{", $attr->{href};
}

# Handle an <img> tag: make sure a local copy of the image exists and
# print a LaTeX figure that includes it.
#
#   $attr   - hash ref with the attributes of the tag (src, title, alt)
#   $attseq - array ref with the attribute names in source order (unused)
#
# Does nothing unless -g was given and the tag has a src attribute.
# The local file name is derived from the URL: the http:// or ftp://
# scheme and any query string are dropped, the remaining `/', `?', `&'
# and `.' characters become `_', and the image extension is re-appended
# (`.png' is assumed when none of the known extensions is found).
# The caption is the title attribute, else alt, else a link to the URL.
sub image_convert {
    my ( $attr, $attseq ) = @_;

    return unless defined( $main::opts{g} );
    return unless defined( $attr->{src} );

    my $url      = $attr->{src};
    my $localimg = $url;
    $localimg =~ s!(?:http|ftp)://!!;
    $localimg =~ s!\?.*!!;

    my $imgtype;
    if ( $localimg =~ s/\.(png|jpg|eps|gif|tif)$// ) {
        $imgtype = $1;
    }
    else {
        warn "Cannot determine a valid image type for $url - Trying with .png";
        $imgtype = 'png';
    }

    $localimg =~ s![/?&.]!_!g;
    $localimg .= ".$imgtype";

    if ( -f $localimg ) {
        warn "$localimg: Already here, skipping download\n";
    }
    elsif ( my $wget = find_wget() ) {
        # -O takes the *next* argument as the output file, so $localimg
        # must follow it directly; the URL goes last.  List-form system()
        # keeps the URL away from the shell.
        my $status = system( $wget, '-nv',
            '--load-cookies', '/tmp/wget.cookies',
            '-nc', '-O', $localimg, $url );
        if ( $status != 0 ) {
            warn "wget could not download $url\n";

            # wget -O creates the file before fetching; do not let an
            # empty leftover pass for a downloaded image on the next run.
            unlink $localimg if -z $localimg;
        }
    }
    else {
        warn "wget not found, you will need to create `$localimg'\n"
            . "(Original URL: $url)\n";
    }

    my $caption = $attr->{title} || $attr->{alt}
        || sprintf( '\href{%s}{%s}', $url, $url );

    printf "
\\begin{figure}
\\centering
\\includegraphics[width=0.4\\textwidth]{%s}
\\caption{%s} 
\\end{figure}", $localimg, $caption;
}

# Look for an executable `wget' in the directories listed in $ENV{PATH}.
#
# Returns the full path of the first match.  When there is none (or PATH
# is not set) a warning is printed and nothing (undef in scalar context)
# is returned.
sub find_wget {
    # An unset PATH simply means there is nowhere to look.
    my $search_path = defined $ENV{PATH} ? $ENV{PATH} : '';

    for my $dir ( split /:/, $search_path ) {
        my $wget = "$dir/wget";

        # -x alone is also true for directories, hence the -f.
        return $wget if -f $wget && -x _;
    }

    warn "wget not found in path - No images will be downloaded\n";
    return;
}

# One entry per currently open <a> element: true when start() caused a
# `\href{...}{' to be printed for it (so that end() owes the closing
# brace), false for anchors that printed nothing, e.g. <a name="...">.
my @open_anchors;

# HTML::Parser callback for an opening tag.
#
#   $tag     - lower-cased tag name
#   $attr    - hash ref with the attributes of the tag
#   $attrseq - array ref with the attribute names in source order
#
# Structural tags switch the document mode, <pre>/<listing> turn off
# LaTeX escaping of the text that follows, <a> and <img> are handed to
# their converters.  Afterwards, while inside <body>, the tag's entry in
# %tagstable_start (if any) is printed.
sub start {
    my ( $self, $tag, $attr, $attrseq ) = @_;

    my %handler_for = (
        html    => sub { start_mode(1) },
        head    => sub { start_mode(2) },
        body    => sub { start_mode(3) },
        pre     => sub { $substitution = 0 },
        listing => sub { $substitution = 0 },
        a       => sub {
            # Same condition under which anchor_convert() prints the
            # opening `\href{...}{'; remember it for the matching </a>.
            push @open_anchors,
                ( defined $main::opts{H} && defined $attr->{href} ) ? 1 : 0;
            anchor_convert( $attr, $attrseq );
        },
        img     => sub { image_convert( $attr, $attrseq ) },
    );

    $handler_for{$tag}->() if $handler_for{$tag};

    return unless ( $mode == 3 and defined $tagstable_start{$tag} );
    print $tagstable_start{$tag};
}

# HTML::Parser callback for a closing tag.
#
#   $tag - lower-cased tag name
#
# Mirrors start(): structural tags leave the current document mode,
# </pre> and </listing> re-enable LaTeX escaping, and </a> closes the
# `\href' group only if the matching <a> actually opened one.  While
# inside <body>, the tag's entry in %tagstable_end (if any) is printed.
sub end {
    my ( $self, $tag ) = @_;

    my %handler_for = (
        html    => sub { end_mode(0) },
        head    => sub { end_mode(1) },
        body    => sub { end_mode(1) },
        pre     => sub { $substitution = 1 },
        listing => sub { $substitution = 1 },

        # A stray </a> pops undef from the empty stack and prints nothing.
        a       => sub { print "}" if pop @open_anchors },
    );

    $handler_for{$tag}->() if $handler_for{$tag};

    return unless ( $mode == 3 and defined $tagstable_end{$tag} );
    print $tagstable_end{$tag};
}

# HTML::Parser callback for character data.
#
#   $text - the raw text, entities still encoded
#
# Outside <body> nothing is printed.  Inside <pre>/<listing> the text is
# entity-decoded and printed untouched.  Everywhere else entities are
# decoded, LaTeX special characters are escaped, a handful of
# typographic entities are mapped to their LaTeX spelling and anything
# that looks like an http/ftp URL is wrapped in \url{}.
sub text {
    my ( $self, $text ) = @_;
    return unless ( $mode == 3 );

    # Inside <pre>/<listing> the output sits in a LaTeX verbatim
    # environment, which typesets every character literally: any escape
    # added here (\$, \^{}, $\backslash$, ...) would show up as-is in the
    # document.  So only decode the entities and leave the rest alone.
    if ( !$substitution ) {
        print decode_entities($text);
        return;
    }

    # Handle some things that decode_entities doesn't.
    # (This needs to be done *before* calling decode_entities: otherwise
    # there'd be no way of distinguishing `&FOO;' from `&amp;FOO;'.)

    # We use `!' for internal purposes during entity translation.
    $text =~ s/!|&\#(?:0*33|x0*21);/!bang;/g;

    # Handle `&lsquo;&ldquo;', `&ndash;&mdash;' and so on by inserting
    # thin space between the translations in such cases.
    $text =~ s/&\#(?:x0*2d|0*45);/-/g;
    $text =~ s/(&mdash;|&ndash;|-)(?=(?:&mdash;|&ndash;|-))/$1!thinsp;/g;
    $text =~ s/(&[lr][sd]quo;)(?=(?:&[lr][sd]quo;))/$1!thinsp;/g;

    # There are many things that decode_entities doesn't handle.
    # A few of those things we handle ourselves.  The final replacement
    # happens later, once the LaTeX special characters have been escaped.
    # In the meantime we change from `&FOO;' to `!FOO;'.
    $text =~ s/&([mn]dash|[lr][sd]quo|hellip);/!$1;/g;

    $text = decode_entities($text);

    # Park backslashes behind a marker so that the escapes added below
    # are not themselves mistaken for user text.
    $text =~ s/\\/!backslash;/g;

    $text =~ s/([_&%\{\}\#])/\\$1/g;
    $text =~ s/\$/\\\$/g;
    $text =~ s/\^/\\^{}/g;

    # Expand the `!FOO;' markers into their LaTeX spelling.
    $text =~ s/!backslash;/\$\\backslash\$/g;
    $text =~ s/!mdash;/---/g;
    $text =~ s/!ndash;/--/g;
    $text =~ s/!lsquo;/`/g;  #`;
    $text =~ s/!rsquo;/'/g;  #';
    $text =~ s/!ldquo;/``/g;
    $text =~ s/!rdquo;/''/g;

    # The backslash must be doubled: a lone `\l' in the replacement is
    # Perl's "lowercase next character" escape and would yield `dots{}'.
    $text =~ s/!hellip;/\\ldots{}/g;
    $text =~ s/!thinsp;/\$\\,\$/g;
    $text =~ s/!bang;/!/g;

    # Non-breaking space.
    $text =~ s/\xa0/~/g;

    # Whatever looks like an URL should be made into one
    $text =~ s![[{]?((?:http|ftp)://\S+)[\]}]?!\\url{$1}!g;

    print $text;
}

# Enter document mode $mode_new (1 = <html>, 2 = <head>, 3 = <body>) and
# print whatever belongs at that point of the LaTeX file.
#
# Output is only produced for the first input file: the banner and, unless
# -P is set, the preamble on <html>; \begin{document} (again unless -P),
# the -h header, the title block (-t / -a), the table of contents (-c) and
# the page break (-p) on <body>.
sub start_mode {
    my ($mode_new) = @_;
    my $skip_pre = $main::opts{P};

    if ($firstfile) {
        if ( $mode_new == 1 ) {
            print join '',
                "% This file was converted from HTML to LaTeX with\n",
                "% gnuhtml2latex program\n",
                "% (c) Tomasz Wegrzanowski <maniek\@beer.com> 1999\n",
                "% (c) Gunnar Wolf <gwolf\@gwolf.org> 2005-2010\n",
                "% Version : $main::version.\n";

            unless ($skip_pre) {
                print '\documentclass', $main::opts{o}, "\n";
                print "\\usepackage{hyperref}\n" if $main::opts{H};
                print "\\usepackage{graphicx}\n",
                    "\\DeclareGraphicsExtensions{.png,.jpg,.eps,.gif,.tif}\n"
                    if $main::opts{g};
            }
        }
        elsif ( $mode_new == 3 ) {
            print "\\begin{document}\n" unless $skip_pre;
            print $main::opts{h};

            # Title block: needs at least one of -a / -t to be present.
            if ( defined $main::opts{a} or defined $main::opts{t} ) {
                my $author = $main::opts{a} || '';
                print '\\title{', $main::opts{t}, '}' if $main::opts{t};
                print '\\author{', $author, "}\n\\maketitle";
            }

            print "\n\\tableofcontents\n" if $main::opts{c};
            print "\n\\newpage"           if $main::opts{p};
        }
    }

    $mode = $mode_new;
}

# Leave the current document mode and switch to $mode_new.
#
# When the body of the last input file is being left, the -f footer is
# printed, followed by \end{document} unless -P is set.
sub end_mode {
    my ($mode_new) = @_;

    if ( $lastfile and $mode == 3 ) {
        print $main::opts{f};
        print "\\end{document}\n" unless $main::opts{P};
    }

    $mode = $mode_new;
}

}

$main::version = '0.4';

# Open $filename for reading and return the handle, or nothing on failure
# (with $! set by open).  `-' means standard input; every other name is
# taken literally, never as a pipe or a mode prefix.
sub _open_input {
    my ($filename) = @_;
    return \*STDIN if $filename eq '-';
    open my $fh, '<', $filename or return;
    return $fh;
}

# Redirect STDOUT to the .tex file that corresponds to input $filename
# (a trailing .html/.htm is replaced by .tex).  Dies when the output file
# cannot be created, rather than silently writing to the terminal.
sub _redirect_stdout {
    my ($filename) = @_;
    ( my $outfile = $filename ) =~ s/\.html?\z//;
    $outfile .= '.tex';
    open STDOUT, '>', $outfile or die "$outfile: $!\n";
    return;
}

# -i: take the list of input files from a file, one name per line.
if ( $main::opts{i} ) {
    open my $list_fh, '<', $main::opts{i}
        or die "$main::opts{i}: $!\n";
    my @listed;
    while ( my $line = <$list_fh> ) {
        # Drop the line terminator and surrounding blanks; otherwise the
        # newline would end up inside the name of the output file.
        $line =~ s/^\s+|\s+$//g;
        push @listed, $line if length $line;
    }
    close $list_fh;
    @ARGV = @listed;
}

# -S: comma-separated list of elements to skip together with their content.
my @skip_tags = $main::opts{S} ? split( /\s*,\s*/, $main::opts{S} ) : ();

if ( $main::opts{b} ) {
    # Batch mode: all inputs are concatenated into ONE LaTeX document.
    # Unless -s is given it is written to the .tex file named after the
    # first input.  Only the first file produces the preamble and only the
    # last one the postamble; a single input is both first and last and is
    # parsed exactly once.
    if (@ARGV) {
        my $doc = HTML::LatexMaker->new;
        $doc->ignore_elements(@skip_tags) if @skip_tags;

        for my $i ( 0 .. $#ARGV ) {
            my $filename = $ARGV[$i];
            my $is_first = $i == 0       ? 1 : 0;
            my $is_last  = $i == $#ARGV ? 1 : 0;

            my $in = _open_input($filename);
            if ( !$in ) {
                # The first and last files carry the preamble/postamble,
                # so the document cannot be produced without them.
                die "$filename: $!\n" if $is_first or $is_last;
                warn "$filename: $! - skipped\n";
                next;
            }

            _redirect_stdout($filename) if $is_first and !$main::opts{s};

            $doc->firstfile($is_first);
            $doc->lastfile($is_last);
            $doc->parse_file($in);
            close $in;
        }
    }
}
else {
    # Default mode: every input becomes a complete document of its own.
    foreach my $filename (@ARGV) {
        my $in = _open_input($filename);
        if ( !$in ) {
            warn "$filename: $! - skipped\n";
            next;
        }

        _redirect_stdout($filename) unless $main::opts{s};

        my $doc = HTML::LatexMaker->new;
        $doc->ignore_elements(@skip_tags) if @skip_tags;
        $doc->parse_file($in);
        close $in;
    }
}

=head1 NAME

gnuhtml2latex - html to latex converter

=head1 SYNOPSIS

B<gnuhtml2latex> F<[options]> F<filename>

=head1 OPTIONS

=over

=item -a [author]

Specify document author

=item -b

Process more than one input HTML file (they all get concatenated and
written to a single output file, or to STDOUT if F<-s> is set)

=item -c

Use table of contents

=item -f [string]

Specify footer (a string inserted just before the end of the document)

=item -h [string]

Specify header (a string inserted right at the beginning of the document body)

=item -i filename

Get the list of files to be converted from the specified filename

=item -m

Allow the use of some tags that require entering and exiting math mode.

Currently, the superscript and subscript tags are achieved by using
the math mode. Now, using the math mode can break some
formatting. Math mode will only be entered in the output document if
you specify this switch.

=item -n

Use numbered sections

=item -H

Use the hyperref package to process anchors


=item -g

Include images. If wget is installed, it will be used in order to
download the images; otherwise, their position will just be marked in
the resulting TeX document.

=item -o [string]

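Specify document style. The string is appended as-is to C<\documentclass>,
so it must include the braces (default: C<{article}>)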

=item -p

Break page after title / table of contents

=item -P

Partial / plain: Omit preamble and postamble. Note that F<-P> makes
F<-H> and F<-o> meaningless (as they act in the preamble)

=item -S

Skip (ignore) the specified comma-separated tags, along with all of
their content.

=item -s

Write to STDOUT instead of to inputfilename.tex

=item -t [title]

Specify title of document

=back

=head1 DESCRIPTION

This aims to be a replacement for html2latex.

The program takes an HTML file (foo.html or foo.htm)
and makes a LaTeX file foo.tex from it.

=head1 NOT VERY AMBITIOUS TODO

For people who want only functionality of original html2latex

 bugfixes - I'm sure there are plenty of bugs inside
 clueful backslash escaping
 more entities from outside of iso-8859-1
 tables
 performance boost
 and a lot more

=head1 MORE AMBITIOUS TODO

For people who want a real tool

 make it part of some html processor

=head1 FUTURE OF THIS PACKAGE

It is quite possible that the functions of this package will be included
in some more general project. This package was made mainly to make the
world a bit more free.

=cut
