comics/fetch.pl.new - view

File: [Local Repository] / comics / fetch.pl.new
Revision 1.8: download - view: text, annotated - select for diffs
Tue Feb 5 14:31:57 2013 UTC (13 years, 2 months ago) by nick
Branches: MAIN
CVS tags: HEAD

Added the string day to the header.  Attempted to add an option '--day X' to download the comics from X days ago.  However, this only works for certain comics, otherwise it just downloads the current day's comic and stores it with the date for the day you want.  Also, this code is still a mess.  I wrote it 10 years ago...  Might be time for an overhaul.

#!/usr/bin/perl -w
use strict;
use warnings;

use File::Path;
use File::Temp ();
use Time::Local ();
use Data::Dumper;
use Pod::Usage;
use Getopt::Long;

##
## Some default values
##
## NOTE: the version string contains '/' characters once CVS expands the
## keyword, so it must not be quoted with q/.../.
my $ver       = q{$Id: fetch.pl.new,v 1.8 2013/02/05 14:31:57 nick Exp $};
my $comicFile = "comics.conf";

## Options are parsed first so that --help / --man work even when the
## configuration file is missing.
my %opts     = fetchOptions();
my %comics   = readComicConfig($comicFile);
my $days_ago = $opts{'days'} || 0;
my %dates    = fetchDates();

my $baseDir   = $comics{'configs'}{'base_directory'} || ".";
my $imageRoot = $baseDir . "/" . ( $comics{'configs'}{'image_directory'} || "images" );
my $imageDir  = $imageRoot . "/$dates{'mon2'}$dates{'year2'}";
my $indexDir  = $baseDir . "/" . ( $comics{'configs'}{'index_directory'} || "indexes" );

my $USER_AGENT = "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.18) Gecko/20110628 Ubuntu/10.10 (maverick) Firefox/3.6.18";
my @days       = qw/ Sunday Monday Tuesday Wednesday Thursday Friday Saturday /;

my $DATE = `date`;
chomp $DATE;
print STDOUT "Starting comic fetch at $DATE\n";

##
## Main program starts here
##
checkDir( [ $imageDir, $indexDir ] );
writeTitle( \%dates );

foreach my $comic ( sort keys %comics ) {
    ## 'configs' holds the "key = value" settings, not a comic.
    next if ( $comic eq 'configs' );

    $comics{$comic}{'error'} = downloadComic( \%comics, $comic, \%dates );
    writeComic( \%comics, $comic, \%dates );
}

##
## Shrink today's images that are wider than 640 pixels.
##
print "Finding in $imageDir/*-$dates{'day2'}.jpg\n";
foreach my $file ( glob("$imageDir/*-$dates{'day2'}.jpg") ) {
    my $width = imageWidth($file);
    next unless ( defined $width && $width > 640 );

    ## List-form system(): the file name never passes through a shell.
    system( '/usr/bin/convert', '-resize', '640', $file, $file ) == 0
        or warn "WARNING: could not resize $file (status $?)\n";
}

## writeMainIndex ( \%dates );
writeFooter( \%dates );

$DATE = `date`;
chomp($DATE);
print STDOUT "Completed comic fetch at $DATE\n";

## End

#######################################################################
## Function    : downloadComic
##
## Description :
##   Determines the download method configured for the comic and calls
##   the appropriate function (mode 1 = indexDownload, mode 2 =
##   directDownload).
##
## Parameters  : $comics - hash ref of all comic definitions
##               $comic  - key of the comic to download
##               $date   - hash ref as produced by fetchDates
##
## Returns     : 0 on success; otherwise an error string or a non-zero
##               exit status.  If the mode is invalid an error string
##               is returned.
#######################################################################
sub downloadComic {
    my ( $comics, $comic, $date ) = @_;

    my $mode = $comics->{$comic}{'mode'};
    $mode = '' unless defined $mode;
    $mode =~ s/\A\s+//;    # tolerate "name, url, search, 1, ..." style lines
    $mode =~ s/\s+\z//;

    return indexDownload( $comics, $comic, $date )  if ( $mode eq '1' );
    return directDownload( $comics, $comic, $date ) if ( $mode eq '2' );

    return "ERROR: Unknown download method specified for $comics->{$comic}{'fullName'}.";
}

#######################################################################
## Function    : readComicConfig
##
## Description :
##   Reads the comic configuration file.  Lines containing at least two
##   commas are comic definitions in the form
##       name,url,search,mode,fullName,ext
##   Lines of the form "key = value" are stored under the 'configs'
##   key.  Lines starting with '#' are ignored.
##
##   NOTE: fields are split on every comma, so a search pattern that
##   itself contains a comma will shift the remaining fields.
##
## Parameters  : $comicFile - path of the configuration file
##
## Returns     : hash of comic definitions (empty, with a warning, when
##               the file cannot be opened)
#######################################################################
sub readComicConfig {
    my ($comicFile) = @_;
    my %comicConfig = ();

    my $fh;
    if ( !open( $fh, '<', $comicFile ) ) {
        warn "WARNING: Can't open comic configuration '$comicFile': $!\n";
        return %comicConfig;
    }

    while ( my $line = <$fh> ) {
        ## Drop the line ending so the last field does not carry it.
        $line =~ s/\r?\n\z//;
        next if ( $line =~ m/^#/ );

        if ( $line =~ m/,.*,/ ) {
            my @res = split /,/, $line;
            $comicConfig{ $res[0] }{'url'}      = $res[1];
            $comicConfig{ $res[0] }{'search'}   = $res[2];
            $comicConfig{ $res[0] }{'mode'}     = $res[3];
            $comicConfig{ $res[0] }{'fullName'} = $res[4];
            $comicConfig{ $res[0] }{'ext'}      = $res[5];
            $comicConfig{ $res[0] }{'error'}    = 0;
        }
        elsif ( $line =~ m/(.*)\s+=\s+(.*)/ ) {
            $comicConfig{'configs'}{$1} = $2;
        }
    }
    close($fh);

    return %comicConfig;
}

#######################################################################
## Function    : indexFileFor
##
## Description :
##   Builds the path of the daily HTML index for the given date.
##
## Parameters  : $date - hash ref as produced by fetchDates
##
## Returns     : path string "<indexDir>/index-YYMMDD.html"
#######################################################################
sub indexFileFor {
    my ($date) = @_;

    return $indexDir . "/index-" . $date->{'year2'} . $date->{'mon2'} . $date->{'day2'} . ".html";
}

#######################################################################
## Function    : comicImagePath
##
## Description :
##   Builds the path a comic image is stored under.  Uses the
##   configured image directory rather than a path relative to the
##   current working directory.
##
## Parameters  : $comic - key of the comic
##               $date  - hash ref as produced by fetchDates
##
## Returns     : path string "<imageRoot>/MMYY/<comic>-DD.jpg"
#######################################################################
sub comicImagePath {
    my ( $comic, $date ) = @_;

    return $imageRoot . "/" . $date->{'mon2'} . $date->{'year2'} . "/$comic-" . $date->{'day2'} . ".jpg";
}

#######################################################################
## Function    : writeComic
##
## Description :
##   Appends one table row for the comic to the daily index: the image
##   when the download succeeded, the error text otherwise.
##
##   NOTE: the image link is hard-coded as "../images/..."; it only
##   matches the download location with the default directory layout.
##
## Parameters  : $comics - hash ref of all comic definitions
##               $comic  - key of the comic to write
##               $date   - hash ref as produced by fetchDates
##
## Returns     : 0 on success, 1 when the index cannot be opened
#######################################################################
sub writeComic {
    my ( $comics, $comic, $date ) = @_;

    my $indexFile = indexFileFor($date);
    my $name      = $comics->{$comic}{'fullName'};
    my $url       = $comics->{$comic}{'url'};
    my $error     = $comics->{$comic}{'error'};

    my $content;
    if ( !$error ) {
        $content = <<"EOF";
  <tr>
    <td align="left">
      $name     <a href="$url"> $url </a>
      <img src="../images/$date->{'mon2'}$date->{'year2'}/$comic-$date->{'day2'}.jpg" alt="$comic-$date->{'day2'}" />
    </td>
  </tr>
EOF
    }
    else {
        ## The row is opened here as well so the table stays balanced.
        $content = <<"EOF";
  <tr>
    <td align="left">
      $name     <a href="$url"> $url </a>
      $comic : $error
    </td>
  </tr>
EOF
    }

    my $index;
    if ( !open( $index, '>>', $indexFile ) ) {
        warn "WARNING: Can't append to $indexFile: $!\n";
        return 1;
    }
    print {$index} $content;
    close($index) or warn "WARNING: Can't close $indexFile: $!\n";

    return 0;
}

#######################################################################
## Function    : writeMainIndex
##
## Description :
##   Placeholder; currently does nothing and is not called.
#######################################################################
sub writeMainIndex {
    my ($date) = @_;
}

#######################################################################
## Function    : writeFooter
##
## Description :
##   Appends the closing HTML (generation date, version, links) to the
##   daily index.
##
## Parameters  : $date - hash ref as produced by fetchDates
#######################################################################
sub writeFooter {
    my ($date) = @_;

    my $indexFile = indexFileFor($date);
    my $sysDate   = `date`;
    chomp $sysDate;

    my $index;
    if ( !open( $index, '>>', $indexFile ) ) {
        warn "WARNING: Can't append to $indexFile: $!\n";
        return;
    }
    print {$index} <<"EOF";
</table>
<center>
Generated on: $sysDate
Version: $ver
CVS: <a href="http://demandred.dyndns.org/cgi-bin/cvsweb/comics/">http://demandred.dyndns.org/cgi-bin/cvsweb/comics/</a>
<a href="http://validator.w3.org/check?uri=referer"><img src="http://www.w3.org/Icons/valid-xhtml10-blue" alt="Valid XHTML 1.0 Transitional" height="31" width="88" border="0" /></a>
</center>
</body>
</html>
EOF
    close($index) or warn "WARNING: Can't close $indexFile: $!\n";
}

#######################################################################
## Function    : checkDir
##
## Description :
##   Creates every directory that does not exist yet.  Accepts plain
##   directory names, array references of names, or a mix of both.
#######################################################################
sub checkDir {
    my @dirs = map { ref $_ eq 'ARRAY' ? @{$_} : $_ } @_;

    foreach my $dir (@dirs) {
        next if ( !defined $dir || -d $dir );
        mkpath($dir);
    }
}

#######################################################################
## Function    : writeTitle
##
## Description :
##   Creates (truncating any previous version) the daily index and
##   writes the HTML header, including the weekday name and date.
##
## Parameters  : $date - hash ref as produced by fetchDates
#######################################################################
sub writeTitle {
    my ($date) = @_;

    my $indexFile = indexFileFor($date);
    my $today     = $days[ $date->{'dow'} ] . " " . $date->{'mon'} . "/" . $date->{'day'} . "/" . $date->{'year'};

    my $index;
    if ( !open( $index, '>', $indexFile ) ) {
        warn "WARNING: Can't create $indexFile: $!\n";
        return;
    }
    print {$index} <<"EOF";
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<title>Daily Comics for $today</title>
</head>
<body bgcolor="#FFFFFF">
<h1>Daily Comics for $today</h1>
<table align="center" cellpadding="5" cellspacing="0">
EOF
    close($index) or warn "WARNING: Can't close $indexFile: $!\n";
}

#######################################################################
## Function    : imageWidth
##
## Description :
##   Asks ImageMagick's identify for the pixel width of an image.
##
## Parameters  : $file - path of the image
##
## Returns     : the width, or nothing when it cannot be determined
#######################################################################
sub imageWidth {
    my ($file) = @_;

    ## List-form pipe open: no shell is involved.
    my $identify;
    if ( !open( $identify, '-|', '/usr/bin/identify', '-format', '%w\n', $file ) ) {
        warn "WARNING: Can't run identify on $file: $!\n";
        return;
    }
    my $first = <$identify>;
    close($identify);

    return unless ( defined $first && $first =~ m/\A(\d+)/ );
    return $1;
}

#######################################################################
## Function    : fetchImage
##
## Description :
##   Downloads $url with wget into a temporary file and converts it to
##   a JPEG stored at $dest.  Both programs are started without a
##   shell, so URLs scraped from remote pages cannot inject commands.
##
## Parameters  : $url     - address of the image
##               $referer - value for the HTTP Referer header
##               $dest    - path of the JPEG to write
##
## Returns     : 0 on success, a non-zero status otherwise
#######################################################################
sub fetchImage {
    my ( $url, $referer, $dest ) = @_;

    return 1 unless ( defined $url && length $url );
    $referer = '' unless defined $referer;

    ## The temporary file is removed when $tmp goes out of scope.
    my $tmp = File::Temp->new( TEMPLATE => 'comic-XXXXXX', TMPDIR => 1 );

    my $status = system(
        'wget', '-q',
        "--referer=$referer",
        "--user-agent=$USER_AGENT",
        '-O', $tmp->filename,
        '--', $url
    );
    return $status if ( $status != 0 );

    return system( '/usr/bin/convert', $tmp->filename, "jpeg:$dest" );
}

#######################################################################
## Function    : resolveURL
##
## Description :
##   Turns a link found on a page into an absolute URL.  Absolute
##   links are returned unchanged; "//host/..." links inherit the
##   page's scheme; "/path" links are joined to the page's scheme and
##   host; anything else is taken relative to the page's directory.
##
## Parameters  : $pageURL - URL of the page the link was found on
##               $link    - the link text
##
## Returns     : the absolute URL
#######################################################################
sub resolveURL {
    my ( $pageURL, $link ) = @_;

    $link =~ s/\A\s+//;
    $link =~ s/\s+\z//;
    return $link if ( $link =~ m{\A[a-z][a-z0-9+.\-]*://}i );

    my ( $scheme, $site ) = $pageURL =~ m{\A([a-z][a-z0-9+.\-]*):(//[^/?\#]+)}i;
    return $pageURL . $link unless defined $scheme;

    return "${scheme}:${link}"        if ( $link =~ m{\A//} );
    return "${scheme}:${site}${link}" if ( $link =~ m{\A/} );

    ( my $dir = $pageURL ) =~ s{[?\#].*\z}{}s;
    $dir =~ s{/[^/]*\z}{} if ( $dir =~ m{\A\Q${scheme}:${site}\E/} );
    return "$dir/$link";
}

#######################################################################
## Function    : directDownload
##
## Description :
##   Downloads a comic whose image URL can be computed directly from
##   the date (see parseComic).
##
## Parameters  : $comics - hash ref of all comic definitions
##               $comic  - key of the comic to download
##               $date   - hash ref as produced by fetchDates
##
## Returns     : 0 on success, a non-zero status otherwise
#######################################################################
sub directDownload {
    my ( $comics, $comic, $date ) = @_;

    my $file = parseComic( $comics, $comic, $date );

    ##
    ## Save the file to the appropriate directory
    ##
    return fetchImage( $file, $comics->{$comic}{'url'}, comicImagePath( $comic, $date ) );
}

#######################################################################
## Function    : indexDownload
##
## Description :
##   Downloads the comic's web page, searches it with the configured
##   'search' regular expression (its first capture group must yield
##   the image link; the last matching line wins) and downloads the
##   image found.
##
## Parameters  : $comics - hash ref of all comic definitions
##               $comic  - key of the comic to download
##               $date   - hash ref as produced by fetchDates
##
## Returns     : 0 on success, an error string otherwise
#######################################################################
sub indexDownload {
    my ( $comics, $comic, $date ) = @_;

    my $entry      = $comics->{$comic};
    my $comicIndex = "$indexDir/index.$comic";

    ## An empty pattern would silently re-use Perl's last successful
    ## regex, and an invalid one would abort the whole run.
    my $pattern = $entry->{'search'};
    if ( !defined $pattern || $pattern eq '' ) {
        return "ERROR: No search pattern configured for $entry->{'fullName'}";
    }
    my $search = eval { qr/$pattern/ };
    if ( !defined $search ) {
        ( my $why = $@ ) =~ s/\s+\z//;
        return "ERROR: Invalid search pattern for $entry->{'fullName'}: $why";
    }

    ## A failed fetch leaves an empty or missing file; both cases are
    ## reported below.
    system( 'wget', '-q', '-O', $comicIndex, '--', $entry->{'url'} );

    my $fh;
    if ( !open( $fh, '<', $comicIndex ) ) {
        return "ERROR: Can't open index file for " . $entry->{'fullName'} . " (" . $entry->{'url'} . ")";
    }
    my @lines = <$fh>;
    close($fh);
    unlink($comicIndex);

    ##
    ## Find the comic strip URL based on the specified regex in the search
    ##
    my $comicLine;
    foreach my $line (@lines) {
        next unless ( $line =~ $search );
        $comicLine = $1;
        chomp $comicLine if defined $comicLine;
    }

    if ( !$comicLine ) {
        return "ERROR: Could not download comic $entry->{'fullName'}";
    }

    if ( $comicLine =~ m/(gif|jpg|png)/i ) {
        $entry->{'ext'} = $1;
    }

    ##
    ## Save the file to the appropriate directory
    ##
    my $comicURL = resolveURL( $entry->{'url'}, $comicLine );
    my $status   = fetchImage( $comicURL, $entry->{'url'}, comicImagePath( $comic, $date ) );
    if ( $status != 0 ) {
        return "ERROR: Could not fetch image for $entry->{'fullName'} ($comicURL)";
    }

    return 0;
}

#######################################################################
## Function    : parseComic
##
## Description :
##   Expands the placeholders __year__, __year2__, __mon__, __mon2__,
##   __day__, __day2__ and __ext__ in the comic's 'search' string.
##
## Parameters  : $comics - hash ref of all comic definitions
##               $comic  - key of the comic
##               $date   - hash ref as produced by fetchDates
##
## Returns     : the expanded string
#######################################################################
sub parseComic {
    my ( $comics, $comic, $date ) = @_;

    my $string = $comics->{$comic}{'search'};
    $string = '' unless defined $string;

    my $ext = $comics->{$comic}{'ext'};
    $ext = '' unless defined $ext;
    $ext =~ s/\s+\z//;    # the last config field may carry a line ending

    my %value_for = (
        'year'  => $date->{'year'},
        'year2' => $date->{'year2'},
        'mon'   => $date->{'mon'},
        'mon2'  => $date->{'mon2'},
        'day'   => $date->{'day'},
        'day2'  => $date->{'day2'},
        'ext'   => $ext,
    );
    $string =~ s/__(year2?|mon2?|day2?|ext)__/$value_for{$1}/g;

    chomp $string;
    return $string;
}

#######################################################################
## Function    : fetchDates
##
## Description :
##   Computes the date parts for the day $days_ago days before today.
##
## Returns     : hash with the keys
##                 day, mon, year - numeric day, month (1-12), year
##                 day2, mon2     - zero-padded two-digit day / month
##                 year2          - last two digits of the year
##                 dow            - day of week, 0 = Sunday
#######################################################################
sub fetchDates {
    my %dates = ();

    ## Step back from local noon: whole multiples of 86400 seconds
    ## taken from "now" can land on the wrong calendar day when a
    ## daylight-saving change lies in between.
    my @now  = localtime(time);
    my $noon = Time::Local::timelocal( 0, 0, 12, $now[3], $now[4], $now[5] + 1900 );

    @dates{qw/ day mon year dow /} = ( localtime( $noon - ( 86400 * $days_ago ) ) )[ 3, 4, 5, 6 ];

    $dates{'year'} += 1900;
    $dates{'mon'}++;
    $dates{'year2'} = substr $dates{'year'}, 2, 2;
    $dates{'day2'}  = sprintf '%02d', $dates{'day'};
    $dates{'mon2'}  = sprintf '%02d', $dates{'mon'};

    return %dates;
}

###############################################################################
##
## fetchOptions( );
##
##  Grab our command line arguments and toss them in to a hash
##
###############################################################################
sub fetchOptions {
    my %opts;

    GetOptions(
        "days:i" => \$opts{'days'},
        "help|?" => \$opts{'help'},
        "man"    => \$opts{'man'},
    ) || pod2usage();

    pod2usage() if defined $opts{'help'};
    pod2usage( { -verbose => 2, -input => \*DATA } ) if defined $opts{'man'};

    return %opts;
}

__END__

=head1 NAME

fetch.pl - Fetches comics and places them all locally in a single html file.

=head1 SYNOPSIS

fetch.pl [options]

 Options:
   --days,d     Fetch comics from X days ago
   --help,?     Display the basic help menu
   --man,m      Display the detailed man page

=head1 DESCRIPTION

Reads the comic definitions from F<comics.conf>, downloads each comic with
wget, converts it to JPEG with ImageMagick and writes a daily HTML index that
shows all comics on one page.

=head1 HISTORY

=head1 AUTHOR

Nicholas DeClario <nick@declario.com>

=head1 BUGS

This is a work in progress.  Please report all bugs to the author.

=head1 SEE ALSO

=head1 COPYRIGHT

=cut