File:
[Local Repository] /
comics /
fetch.pl.new
Revision
1.1.1.1 (vendor branch):
download - view:
text,
annotated -
select for diffs
Thu Aug 18 12:39:12 2011 UTC (13 years, 4 months ago) by
nick
Branches:
INITIAL
CVS tags:
v1_0
Initial import. Though I've been maintaining this script since late 2004 and it has undergone MANY revisions, I never added it to CVS. Most recently I updated the script to work with the latest changes to the gocomics website: I added USER_AGENT strings and, because only one of the two download functions was passing the 'referer', I added it to the second one as well.
#!/usr/bin/perl -w
use strict;
use File::Path;
use Data::Dumper;
##
## Some default values
##
## CVS expands the Id keyword to text that contains "/" (the check-in date),
## so the string must not use "/" as its quote delimiter.
my $ver = q{$Id: fetch.pl.new,v 1.1.1.1 2011/08/18 12:39:12 nick Exp $};
my $comicFile = "comics.conf";
my %comics = &readComicConfig ( $comicFile );
my %dates = &fetchDates();
## Directory layout.  Each part can be overridden by a "key = value" line in
## the configuration file; images are grouped in one directory per month.
my $baseDir = $comics{'configs'}{'base_directory'} || ".";
my $imageDir = $baseDir . "/" . ( $comics{'configs'}{'image_directory'} || "images" ) .
    "/$dates{'mon2'}$dates{'year2'}";
my $indexDir = $baseDir . "/" . ( $comics{'configs'}{'index_directory'} || "indexes" );
## Browser identification sent with the image downloads.
my $USER_AGENT = "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.18) Gecko/20110628 Ubuntu/10.10 (maverick) Firefox/3.6.18";
my $DATE=`date`; chomp $DATE;
print STDOUT "Starting comic fetch at $DATE\n";
##
## Main program starts here
##
## The subs are defined further down, so they are called with a leading "&"
## which skips prototype checking.
&checkDir ( $imageDir, $indexDir );
&writeTitle ( \%dates );
foreach my $comic ( sort keys %comics ) {
    ## 'configs' holds the settings from the configuration file; it is the
    ## only key that is not a comic.
    next if ( $comic eq 'configs' );
    ## A false 'error' means the download worked; anything else is reported
    ## on the index page in place of the image.
    $comics{$comic}{'error'} = &downloadComic ( \%comics, $comic, \%dates );
    &writeComic ( \%comics, $comic, \%dates );
}
## &writeMainIndex ( \%dates );
&writeFooter( \%dates );
$DATE=`date`; chomp( $DATE );
print STDOUT "Completed comic fetch at $DATE\n";
## End
#######################################################################
## Function : downloadComic
##
## Description :
## Looks at the 'mode' configured for a comic and hands the work to the
## matching download function:
##   1 - indexDownload  (fetch the comic's page and search it for the image)
##   2 - directDownload (build the image URL from the search template)
##
## Parameters :
## $comics - reference to the comic configuration hash
## $comic  - key of the comic to download
## $date   - reference to the date hash
##
## Returns :
## Whatever the selected download function returns, or an "ERROR: ..."
## string when the mode is missing or not one of the above.
##
#######################################################################
sub downloadComic {
    my ( $comics, $comic, $date ) = @_;
    my %downloaderFor = (
        1 => \&indexDownload,
        2 => \&directDownload,
    );
    my $mode = $comics->{$comic}{'mode'};
    if ( defined $mode && exists $downloaderFor{$mode} ) {
        ## Hand over the hash reference we were given, not the file global.
        return $downloaderFor{$mode}->( $comics, $comic, $date );
    }
    return "ERROR: Unknown download method specified for $comics->{$comic}{'fullName'}.";
}
#######################################################################
## Function : readComicConfig
##
## Description :
## Reads the comic configuration file.  Lines starting with "#" are
## ignored.  A line with at least two commas describes a comic:
##   name,url,search,mode,fullName,ext
## Any other line of the form "key = value" is stored as a setting under
## the 'configs' key.
##
## Parameters :
## $comicFile - path of the configuration file
##
## Returns :
## A hash keyed by comic name (plus 'configs' when settings were found).
## If the file cannot be opened a warning is printed and the hash is
## empty.
##
#######################################################################
sub readComicConfig {
    my ( $comicFile ) = @_;
    my %comicConfig = ( );
    my $fh;
    if ( ! open( $fh, '<', $comicFile ) ) {
        warn "Can't read comic configuration '$comicFile': $!\n";
        return %comicConfig;
    }
    while ( my $line = <$fh> ) {
        ## Drop the line ending so it does not end up in the last field.
        $line =~ s/[\r\n]+\z//;
        next if ( $line =~ m/^#/ );
        if ( $line =~ m/,.*,/ ) {
            my ( $name, @fields ) = split /,/, $line;
            ## Assign key by key so an existing entry is updated, not replaced.
            @{ $comicConfig{$name} }{ qw( url search mode fullName ext ) } = @fields[ 0 .. 4 ];
            $comicConfig{$name}{'error'} = 0;
        }
        elsif ( $line =~ m/(.*)\s+=\s+(.*)/ ) {
            my ( $key, $value ) = ( $1, $2 );
            ## Extra blanks around the "=" must not become part of the
            ## key or the value.
            for ( $key, $value ) { s/^\s+//; s/\s+$//; }
            $comicConfig{'configs'}{$key} = $value;
        }
    }
    close( $fh );
    return %comicConfig;
}
#######################################################################
## Function : writeComic
##
## Description :
## Appends one table row for a comic to the day's index page.  When the
## comic's 'error' value is false the row shows the image, otherwise it
## shows the error text in red.
##
## Parameters :
## $comics - reference to the comic configuration hash
## $comic  - key of the comic being written
## $date   - reference to the date hash (year2, mon2 and day2 are used)
##
## Returns :
## Always 0.  A warning is printed if the index page cannot be written.
##
#######################################################################
sub writeComic {
    my ( $comics, $comic, $date ) = @_;
    my $indexFile = $indexDir . "/index-" . $date->{'year2'} . $date->{'mon2'} .
        $date->{'day2'} . ".html";
    my $content;
    if ( $comics->{$comic}{'error'} ) {
        ## The error row opens its own <tr>/<td> so the table stays well
        ## formed.
        $content = <<EOF;
<tr>
<td align="left">
<font color="blue"><b>$comics->{$comic}{'fullName'}</b></font>
<font size="-2">
<a href="$comics->{$comic}{'url'}">
$comics->{$comic}{'url'}
</a>
</font><br/>
<font color="red"><b>$comic : $comics->{$comic}{'error'}</b></font><br/>
</td>
</tr>
EOF
    }
    else {
        $content = <<EOF;
<!-- ********* Begin $comic ($comics->{$comic}{'fullName'}) ******* -->
<tr>
<td align="left">
<font color="blue"><b>$comics->{$comic}{'fullName'}</b></font>
<font size="-2">
<a href="$comics->{$comic}{'url'}">
$comics->{$comic}{'url'}
</a>
</font><br/>
<img src="../images/$date->{'mon2'}$date->{'year2'}/$comic-$date->{'day2'}.jpg" alt="$comic-$date->{'day2'}" />
<br/><br/>
</td></tr>
<!-- ********* Finish $comic ($comics->{$comic}{'fullName'}) ******* -->
EOF
    }
    my $index;
    if ( ! open( $index, '>>', $indexFile ) ) {
        warn "Can't append to index page '$indexFile': $!\n";
        return 0;
    }
    print {$index} $content;
    close( $index ) or warn "Can't finish writing '$indexFile': $!\n";
    return 0;
}
#######################################################################
## Function : writeMainIndex
##
## Description :
## Placeholder for writing a main index page.  It only unpacks its
## argument; nothing is written.
##
#######################################################################
sub writeMainIndex ($$) {
    my ( $dateInfo ) = @_;
}
#######################################################################
## Function : writeFooter
##
## Description :
## Appends the closing part of the day's index page: the end of the
## table, a "Generated at" line with the output of the date command and
## the validator badge.
##
## Parameters :
## $date - reference to the date hash (year2, mon2 and day2 are used)
##
## Returns :
## True when the page was written and closed, false otherwise (a
## warning is printed in that case).
##
#######################################################################
sub writeFooter {
    my ( $date ) = @_;
    my $indexFile = $indexDir . "/index-" . $date->{'year2'} . $date->{'mon2'} .
        $date->{'day2'} . ".html";
    my $sysDate = `date`;
    my $index;
    if ( ! open( $index, '>>', $indexFile ) ) {
        warn "Can't append footer to '$indexFile': $!\n";
        return 0;
    }
    print {$index} <<EOF;
</table>
<center>Generated at $sysDate
<p>
<a href="http://validator.w3.org/check?uri=referer"><img
src="http://www.w3.org/Icons/valid-xhtml10-blue" alt="Valid XHTML 1.0 Transitional" height="31" width="88" border="0" /></a>
</p>
</center>
</body>
</html>
EOF
    my $closed = close( $index );
    warn "Can't finish writing '$indexFile': $!\n" if ( ! $closed );
    return $closed;
}
#######################################################################
## Function : checkDir
##
## Description :
## Makes sure every given directory exists, creating missing ones
## (including parent directories) with mkpath.
##
## Parameters :
## A list of directory names.  A reference to an array of names is
## accepted as well and is expanded.  Undefined or empty names are
## skipped.
##
#######################################################################
sub checkDir {
    my @dirs = map { ref( $_ ) eq 'ARRAY' ? @{ $_ } : $_ } @_;
    foreach my $dir ( @dirs ) {
        next if ( ! defined $dir || $dir eq '' );
        mkpath( $dir ) if ( ! -d $dir );
    }
    return;
}
#######################################################################
## Function : writeTitle
##
## Description :
## Starts the day's index page.  Any existing page for the same day is
## overwritten.  Writes the XHTML header, the page title and heading
## with the date as mon/day/year, and opens the table that the comic
## rows are appended to.
##
## Parameters :
## $date - reference to the date hash
##
## Returns :
## True when the page was written and closed, false otherwise (a
## warning is printed in that case).
##
#######################################################################
sub writeTitle {
    my ( $date ) = @_;
    my $indexFile = $indexDir . "/index-" . $date->{'year2'} . $date->{'mon2'} .
        $date->{'day2'} . ".html";
    my $today = $date->{'mon'} . "/" . $date->{'day'} . "/" . $date->{'year'};
    my $index;
    if ( ! open( $index, '>', $indexFile ) ) {
        warn "Can't create index page '$indexFile': $!\n";
        return 0;
    }
    print {$index} <<EOF;
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<title>Daily Comics for $today</title>
</head>
<body bgcolor="#FFFFFF">
<h1>Daily Comics for $today</h1>
<table align="center" cellpadding="5" cellspacing="0">
EOF
    my $closed = close( $index );
    warn "Can't finish writing '$indexFile': $!\n" if ( ! $closed );
    return $closed;
}
#######################################################################
## Function : directDownload
##
## Description :
## Downloads a comic whose image URL can be built directly from the
## 'search' template (see parseComic).  The image is fetched with wget
## and piped through convert, which resizes it to 640 and stores it as
## <base>/<image dir>/<mon2><year2>/<comic>-<day2>.jpg
##
## Parameters :
## $comics - reference to the comic configuration hash
## $comic  - key of the comic to download
## $date   - reference to the date hash
##
## Returns :
## The value of system() for the download pipeline (0 on success).
##
#######################################################################
sub directDownload {
    my ( $comics, $comic, $date ) = @_;
    my $file = &parseComic ( $comics, $comic, $date );
    ##
    ## Save the file to the appropriate directory.  The location follows the
    ## same settings and defaults as the directories created at startup.
    ##
    my $configs = $comics->{'configs'} || {};
    my $imageRoot = ( $configs->{'base_directory'} || "." ) . "/" .
        ( $configs->{'image_directory'} || "images" );
    my $target = $imageRoot . "/" . $date->{'mon2'} . $date->{'year2'} .
        "/$comic-" . $date->{'day2'} . ".jpg";
    ## Single-quote every value that goes into the command line so that
    ## characters such as & ? $ or blanks in a URL reach wget unchanged
    ## instead of being interpreted by the shell.
    my $quote = sub {
        my ( $text ) = @_;
        $text =~ s/'/'\\''/g;
        return "'" . $text . "'";
    };
    my $cmd = "wget -q " . $quote->( $file ) .
        " --referer=" . $quote->( $comics->{$comic}{'url'} ) .
        " --user-agent=" . $quote->( $USER_AGENT ) .
        " -O - | /usr/bin/convert -resize 640 - " . $quote->( "jpeg:$target" );
    return system( $cmd );
}
#######################################################################
## Function : indexDownload
##
## Description :
## Downloads a comic whose image URL has to be looked up on the comic's
## web page.  The page is fetched with wget into a temporary file in the
## index directory, every line is matched against the 'search' regular
## expression and the first capture group of the last matching line is
## taken as the image URL.  A URL that does not contain "http" is
## appended to the scheme and host of the page URL.  The image is then
## fetched with wget and piped through convert, which resizes it to 640
## and stores it as
## <base>/<image dir>/<mon2><year2>/<comic>-<day2>.jpg
##
## If the image URL contains gif, jpg or png the comic's 'ext' value is
## updated accordingly.
##
## Parameters :
## $comics - reference to the comic configuration hash
## $comic  - key of the comic to download
## $date   - reference to the date hash
##
## Returns :
## 0 on success, otherwise an "ERROR: ..." string.
##
#######################################################################
sub indexDownload {
    my ( $comics, $comic, $date ) = @_;
    my $pageURL = $comics->{$comic}{'url'};
    my $search = $comics->{$comic}{'search'};
    ## An empty pattern would make the match below reuse whatever regex
    ## matched last, so refuse it up front.
    if ( ! defined $search || $search eq '' ) {
        return "ERROR: No search pattern configured for $comics->{$comic}{'fullName'}";
    }
    ## Directories follow the same settings and defaults as the ones
    ## created at startup.
    my $configs = $comics->{'configs'} || {};
    my $rootDir = $configs->{'base_directory'} || ".";
    my $comicIndex = $rootDir . "/" . ( $configs->{'index_directory'} || "indexes" ) .
        "/index.$comic";
    ## List form of system(): the URL is handed to wget as is, no shell is
    ## involved.
    system( 'wget', '-q', $pageURL, '-O', $comicIndex );
    my $page;
    if ( ! open( $page, '<', $comicIndex ) ) {
        return "ERROR: Can't open index file for " . $comics->{$comic}{'fullName'} .
            " (" . $pageURL . ")";
    }
    my @lines = <$page>;
    close( $page );
    unlink( $comicIndex );
    ## Reduce the page URL to scheme and host; relative image paths are
    ## appended to this.
    my $mainURL = $pageURL;
    if ( $pageURL =~ m{\A((?:[a-z][a-z0-9+.\-]*://)?[^/]+)}i ) {
        $mainURL = $1;
    }
    ##
    ## Find the comic strip URL based on the specified regex in the search
    ##
    my $comicLine;
    foreach my $line ( @lines ) {
        ## Only a match that actually captured something is usable.
        if ( $line =~ m/$search/ && defined $1 ) {
            $comicLine = $1;
            chomp $comicLine;
        }
    }
    if ( ! $comicLine ) {
        return "ERROR: Could not download comic $comics->{$comic}{'fullName'}";
    }
    if ( $comicLine =~ m/(gif|jpg|png)/i ) { $comics->{$comic}{'ext'} = $1; }
    my $comicURL = ( $comicLine =~ m/http/ ) ? $comicLine : $mainURL . $comicLine;
    ##
    ## Save the file to the appropriate directory
    ##
    my $target = $rootDir . "/" . ( $configs->{'image_directory'} || "images" ) . "/" .
        $date->{'mon2'} . $date->{'year2'} . "/$comic-" . $date->{'day2'} . ".jpg";
    ## The image URL comes from a downloaded page, so it must never be
    ## interpreted by the shell: single-quote every value in the command.
    my $quote = sub {
        my ( $text ) = @_;
        $text =~ s/'/'\\''/g;
        return "'" . $text . "'";
    };
    my $cmd = "wget --user-agent=" . $quote->( $USER_AGENT ) .
        " --referer=" . $quote->( $pageURL ) .
        " -q " . $quote->( $comicURL ) .
        " -O - | /usr/bin/convert -resize 640 - " . $quote->( "jpeg:$target" );
    if ( system( $cmd ) != 0 ) {
        return "ERROR: Could not download comic $comics->{$comic}{'fullName'}";
    }
    return 0;
}
#######################################################################
## Function : parseComic
##
## Description :
## Expands the placeholders in a comic's 'search' template and returns
## the result with a trailing newline removed.  Placeholders:
##   __year__ __year2__ __mon__ __mon2__ __day__ __day2__
##     replaced by the entries of the same name in the date hash
##   __ext__
##     replaced by the comic's 'ext' value
##
## Parameters :
## $comics - reference to the comic configuration hash
## $comic  - key of the comic
## $date   - reference to the date hash
##
#######################################################################
sub parseComic ($$) {
    my ( $comics, $comic, $date ) = @_;
    my $expanded = $comics->{$comic}{'search'};
    ## Placeholders are replaced one after the other, in this order.
    my @replacements = (
        [ '__year__',  $date->{'year'} ],
        [ '__year2__', $date->{'year2'} ],
        [ '__mon__',   $date->{'mon'} ],
        [ '__mon2__',  $date->{'mon2'} ],
        [ '__day__',   $date->{'day'} ],
        [ '__day2__',  $date->{'day2'} ],
        [ '__ext__',   $comics->{$comic}{'ext'} ],
    );
    foreach my $pair ( @replacements ) {
        my ( $token, $value ) = @{ $pair };
        $expanded =~ s/\Q$token\E/$value/g;
    }
    chomp $expanded;
    return $expanded;
}
#######################################################################
## Function : fetchDates
##
## Description :
## Builds the date hash used throughout the script from the current
## local time.
##
## Returns :
## A hash with the keys
##   day, mon, year   - day of month, month (1-12) and four digit year
##   dow              - day of week as reported by localtime
##   day2, mon2       - day and month padded to two digits
##   year2            - last two digits of the year
##
#######################################################################
sub fetchDates () {
    my ( $mday, $month, $year, $wday ) = ( localtime )[ 3, 4, 5, 6 ];
    ## If you missed a day or two, subtract the number of days here:
    # $mday -= 1;
    $year  += 1900;   ## localtime counts years from 1900
    $month += 1;      ## ... and months from 0
    my %stamp = (
        'day'   => $mday,
        'mon'   => $month,
        'year'  => $year,
        'dow'   => $wday,
        'year2' => substr( $year, 2, 2 ),
        'day2'  => sprintf( "%02d", $mday ),
        'mon2'  => sprintf( "%02d", $month ),
    );
    return %stamp;
}
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>