#! /usr/bin/perl
###############################################################################
#
#
#
###############################################################################


# I've got the power!
use Ferret;


# Program-wide variable declarations
# Program-wide variable declarations.
# NOTE: the original "@urllist;" and "%urldone;" lines were useless
# void-context statements (no-ops); initialize the globals explicitly.
use FileHandle;
$count   = 0;		# Counter for periodic database flushing
@urllist = ();		# Queue of URLs yet to process
%urldone = ();		# Set of already-processed URLs (to avoid looping)
STDOUT->autoflush(1);	# Show crawl progress immediately


# A subroutine to validate URLs
# Resolve a (possibly relative) link against its base URL and decide
# whether the crawler should follow it.  Returns the canonicalised
# absolute URL, or the empty list when the link is out of scope
# (non-HTTP scheme, off-site host, or matching the --avoid pattern).
# Relies on the globals $mpatt (allowed-host pattern) and $avoid
# (disallowed-filename pattern) set during option parsing.
sub CleanURL {
	my ($base, $link) = @_;

	# Let Ferret turn the reference into an absolute URL.
	$link = Ferret::ResolveURL($base, $link);

	# Remove CGI arguments & tags
	$link =~ s/[?\#].*$//;

	# Split both URLs into scheme / host / port / path components.
	my ($bscheme, $bhost, $bport, $bpath) =
		($base =~ m|^(\w+:)?(//[^/:]*)?(:\d+)?(.*)$|i);
	my ($lscheme, $lhost, $lport, $lpath) =
		($link =~ m|^(\w+:)?(//[^/:]*)?(:\d+)?(.*)$|i);

	# Only plain (or schemeless) HTTP links are crawlable.
	return unless $lscheme =~ m/^(|http:)$/i;
	# Stay on the same host unless --machines explicitly allows others.
	return unless $lhost eq $bhost || ($mpatt && $lhost =~ m!^$mpatt$!i);
	# Honour the --avoid filename pattern.
	return if $avoid && $lpath =~ m!$avoid!i;

	return $link;
}



###############################################################################
#
# Begin main program
#
###############################################################################



# Parse command line arguments
my $usage = qq"
Use: $0 [--index=<index-file>] [--machines=<allowed-machine-pattern>]
     [--avoid=<disallowed-filename-pattern>] [--delay=<delay-time>]
     [--summary=<max-summary-bytes>] [--lines=<max-summary-lines>]
     [--name=<user's-name>] [--email=<contact-email-address>] [--shrink]
     [--refresh=<url-pattern>] <new-url> [...]

";

# Option defaults, all overridable on the command line.
($index, $mpatt, $avoid, $ssize, $lines) = ('ferret.index', '', '', '', '');
($sleep, $flush)                         = (2, 100);
($fresh, $shrnk, $uname, $email)         = ('', 0, '', '');

# Parse command line arguments.  Options set the globals declared above;
# anything that is not an option is treated as a seed URL to crawl.
die $usage unless @ARGV > 0;

foreach (@ARGV) {
	if (/^--index=(.+)/)	{ $index = $1;	next; }
	if (/^--machines=(.+)/)	{ $mpatt = $1;	next; }
	if (/^--avoid=(.+)/)	{ $avoid = $1;	next; }	# was mis-assigned to $Avoid and patched up afterwards
	if (/^--summary=(.+)/)	{ $ssize = $1;	next; }
	if (/^--lines=(.+)/)	{ $lines = $1;	next; }
	if (/^--delay=(.+)/)	{ $sleep = $1;	next; }
	if (/^--refresh=(.+)/)	{ $fresh = $1;	next; }
	if (/^--shrink$/)		{ $shrnk =  1;	next; }
	if (/^--name=(.+)/)		{ $uname = $1;	next; }
	if (/^--email=(.+)/)	{ $email = $1;	next; }
	if (/^--flush=(.+)/)	{ $flush = $1;	next; }
	if (/^--/)				{ die $usage;	next; }	# unknown option

	# A bare host name gets a leading "//" so CleanURL parses it as a host.
	$_ = "//$_" unless m|/|;
	my $url = CleanURL($_);
	die "Error: Invalid url '$_'\n" unless $url;

	push @urllist,$url;
}
if ($uname) {
	# Identify ourselves politely in the User-Agent header.
	$uname = "Ferret/$Ferret::VERSION ($uname)";
}


# Load the big guns...
# (Direct method call; "new Ferret" was the fragile indirect-object syntax.)
$search = Ferret->new;
$search->Update($index);


# A friendly reminder
unless ($uname && $email) {
	print "It is considered good netiquette when crawling the web to include both your\n";
	print "name and an email address at which you can be contacted.\n";
}


# If we're doing a "refresh", load the appropriate urls into the list
push @urllist, grep(/$fresh/,$search->DocumentList()) if $fresh;


# Main crawl loop: fetch each queued URL, follow redirects, harvest links
# from HTML, and index the document.  Runs until the queue is empty.
while (@urllist) {
	my $url  = shift @urllist;
	next unless $url;			# CleanURL may have queued undef
	next if $urldone{$url};		# already visited in this run
	my $lurl = $url;			# remember the URL we actually requested

	# Build GET options: polite identification plus conditional-GET
	# support so unchanged documents come back as 304.
	my $opts = "";
	$opts .= "User-agent: $uname\n" if $uname;
	$opts .= "From: $email\n" if $email;
	$opts .= "If-Modified-Since: " . Ferret::TimetoRFC1123($search->DocumentTimestamp($url)) . "\n";
	$opts .= "Accept: application/x-mif, text/html, text/plain\n";

	print "\n$url: ";
	my $data = eval { Ferret::LoadHTTP($url,$opts) };
	my $size = (length $data) / 1024;
	printf "(%0.1fk) ",$size;
	if ($@) {
		# Fetch failed -- mark done so we never retry in this run.
		$urldone{$url} = 1;
		print STDERR "$@\n";
		next;
	}

	$data =~ s/\r//g;
	# Split off the response header block and pull out the fields we need.
	my($head) = ($data =~ m!^(.*?\n)\n!s);
	my($type) = ($head =~ m!^Content-type:\s+(.*)!im);
	my($rslt) = ($head =~ m!^HTTP\S*\s+(\d+)!im);

	$urldone{$url} = 1;
	if ($head =~ m/^Location: (.*)/im) { $url = $1; }

	if ($rslt == 301 || $rslt == 302) {
		# Redirect: queue the target and forget the old document.
		print "-> $url ";
		push @urllist, CleanURL($lurl,$url);
		$search->RemoveDocument($lurl);
		$search->DBDelSummary("$lurl");
		next;
	} elsif ($rslt == 304) {
		# Conditional GET says our indexed copy is still current.
		print "-- Unchanged ";
		next;
	}

	$urldone{$url} = 1;		# $url may have been rewritten by a Location: header
	$search->RemoveDocument($url);
	$search->DBDelSummary("$url");

	if ($rslt != 200 && $rslt != 304) {	# (304 was already handled above)
		$head =~ m!^HTTP\S*\s+(.*)!im;
		print STDERR "Error: Could not load '$url' -- $1\n";
		next;
	}

	unless ($type) {
		print STDERR "Error: '$url' returned with no content-type\n";
		next;
	}
	$data =~ s/^.*?\n\n//s;		# drop the headers, keep the body
	study $data;


	my($title,$summary,$newurl);
	if ($type eq "text/html") {
		print '"HTML" ';
		# Harvest outgoing links before the markup is stripped.
		my @hrefs = ($data =~ m/<a\s[^>]*href=\"?([^>\"\s\#\?]+).*?>/gis);
		if (@hrefs) {
			printf "(%u hrefs", scalar @hrefs;
			@hrefs = Ferret::Uniq(sort(@hrefs));
			printf ", %u unique) ", scalar @hrefs;
			foreach (@hrefs) {
				$newurl = CleanURL($url,$_);
				# Skip images; everything else joins the queue.
				unless ($newurl =~ m!\.(jpg|jpeg|gif|png|bmp)$!) {
					push @urllist, $newurl;
				}
			}
		}

		Ferret::StripHTML(\$data,\$title,\$summary,$ssize);
		Ferret::ResolveHTMLImages(\$summary,$url,0.75,100);
		Ferret::MakeHTMLSummary(\$summary,1);
		$search->AddDocument($url,$data);
	} elsif ($type eq "text/plain") {
		print '"Text" ';
		# Source files (by extension, or a "#!" first line) get the code stripper.
		if ($url =~ m/\.(h|hpp|h\+\+|c|cpp|c\+\+)$/i || $data =~ m/^\#\!/) {
			Ferret::StripCode(\$data,\$title,\$summary, ($lines ? $lines*400 : $ssize));
		} else {
			Ferret::StripText(\$data,\$title,\$summary, ($lines ? $lines*400 : $ssize));
		}
		Ferret::LimitLineCount(\$summary,$lines) if $lines;
		Ferret::MakeHTMLSummary(\$summary);
		$search->AddDocument($url,$data);
	} elsif ($type eq "application/x-mif") {
		print '"MIF" ';
		Ferret::StripMIF(\$data,\$title,\$summary, ($lines ? $lines*400 : $ssize));
		Ferret::LimitLineCount(\$summary,$lines) if $lines;
		Ferret::MakeHTMLSummary(\$summary);
		$search->AddDocument($url,$data);
	} else {
		print STDERR "Warning: Unknown content-type '$type' from $url\n";
		next;
	}

	$search->DBPutSummary("$url",$summary) if $summary;

	# Flush the database periodically; --flush controls the interval.
	# (This was hard-coded to 100, which made the --flush option a no-op.)
	if (++$count >= $flush) {
		print "Writing database... ";
		$search->Flush();
		$count = 0;
	}

	sleep $sleep if $sleep > 0;
}



# Time to put it all away...  May take a while to write the database.
print "\nWriting database...\n";
if ($shrnk) {
	$search->Shrink();
}
$search->Close();



