Annotation of comics/fetch.pl.new, revision 1.22

1.1       nick        1: #!/usr/bin/perl -w
                      2: 
1.15      nick        3: ###############################################################################
1.16      nick        4: # $Log: fetch.pl.new,v $
1.22    ! nick        5: # Revision 1.21  2015/10/26 14:25:40  nick
        !             6: # Fixed a bug that was improperly including the day of week string preventing the weekend comics from fetching proproperly.
        !             7: #
1.21      nick        8: # Revision 1.20  2015/10/22 12:58:44  nick
                      9: # Added the ability for Sunday only comics.  Stonesoup is no longer weekdays, this has been added to Sunday only.  I also added Foxtrot Classics for weekdays and Foxtrot for Sundays.
                     10: #
1.20      nick       11: # Revision 1.19  2015/07/13 12:56:58  nick
                     12: # Added Sally Forth and Pearls Before Swine.  Adding Sally Forth required a change in the 'wget' command for fetching the index file to include 'user-agent' and 'referer'.
                     13: #
1.19      nick       14: # Revision 1.18  2015/05/07 12:31:43  nick
                     15: # Added favicon
                     16: #
1.18      nick       17: # Revision 1.17  2015/02/19 14:56:10  nick
                     18: # Fixed a problem that forced everything to JPG.  This would kill GIF animations, but would not display the gifs either because 'convert' appends an index number to the end of the file name for each from of the GIF animation.  I fixed this to maintain GIF compatibilty as well as rewritting how the script fetches the size of the file.  Additionally, I updated the configuration for Questionable Content to search for GIF or JPG, which is what triggered this entire update.
                     19: #
1.17      nick       20: # Revision 1.16  2015/02/05 18:05:58  nick
                     21: # Changed the background and added a fancy title.
                     22: #
1.16      nick       23: # Revision 1.15  2015/01/19 13:46:19  nick
                     24: # *** empty log message ***
                     25: #
1.15      nick       26: ###############################################################################
                     27: 
1.1       nick       28: use strict;
                     29: use File::Path;
                     30: use Data::Dumper;
1.8       nick       31: use Pod::Usage;
                     32: use Getopt::Long;
1.1       nick       33: 
1.21      nick       34: use Date::Calc qw/Date_to_Text_Long Today Day_of_Week Day_of_Week_to_Text/;
1.16      nick       35: 
1.1       nick       36: ## 
                     37: ## Some default values
                     38: ##
my $ver            = '$Id: fetch.pl.new,v 1.21 2015/10/26 14:25:40 nick Exp $';
my $comicFile      = "comics.conf";
## Declared before readComicConfig() is called: that function overwrites
## this value with the version string found in the configuration file.
my $comicConfigVer = "Unknown";
## The functions below are called with a leading '&' because they are
## defined further down in the file, after these statements are compiled.
my %comics         = &readComicConfig ( $comicFile );
my %opts           = &fetchOptions( );
my $days_ago       = $opts{'days'} || 0;
my %dates          = &fetchDates();
my $baseDir        = $comics{'configs'}{'base_directory'} || ".";
my $imageDir       = $baseDir . "/" . ( $comics{'configs'}{'image_directory'} || "images" ) .
                     "/$dates{'mon2'}$dates{'year2'}";
my $indexDir       = $baseDir . "/" . ( $comics{'configs'}{'index_directory'} || "indexes" );
my $USER_AGENT     = "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.18) Gecko/20110628 Ubuntu/10.10 (maverick) Firefox/3.6.18";
my @days           = qw/ Sunday Monday Tuesday Wednesday Thursday Friday Saturday /;

my $DATE=`date`; chomp $DATE;
print STDOUT "Starting comic fetch at $DATE\n";

##
## Main program starts here
##
&checkDir ( [ $imageDir, $indexDir ] );

&writeTitle ( \%dates );

foreach my $comic ( sort keys %comics ) {

    ## 'configs' holds the global settings; it is not a comic definition.
    next if ( $comic eq 'configs' );

    ## Skip if this is Sunday and the comic is weekdays only
    if ( ( $dates{'wday'} eq "Sunday" ) &&
         ( $comics{$comic}{'sunday'} == 0 ) ) {
        print "Skipping '$comic'; Weekdays only.\n";
        next;
    }

    ## Skip if Sunday only comic and it's not Sunday.
    if ( ( $dates{'wday'} ne "Sunday" ) &&
         ( $comics{$comic}{'sunday_only'} == 1 ) ) {
        print "Skipping '$comic' ($comics{$comic}{'sunday_only'}); Sunday only.\n";
        next;
    }

    $comics{$comic}{'error'} = &downloadComic ( \%comics, $comic, \%dates );
    &writeComic ( \%comics, $comic, \%dates );

    ## A failed download leaves nothing worth measuring or resizing.
    next if ( $comics{$comic}{'error'} );

    ## The extension may still carry the line ending of the config line
    ## it was read from; the commands below do not go through a shell,
    ## so it has to be removed here.
    my $ext = defined $comics{$comic}{'ext'} ? $comics{$comic}{'ext'} : '';
    $ext =~ s/\s+$//;

    my $file = "$imageDir/$comic-$dates{'day2'}.$ext";
    next if ( ! -s $file );

    ## Ask ImageMagick for the image width.  The first "Geometry:" line
    ## wins, which for an animated GIF is the first frame.
    my $size = 0;
    my $img;
    if ( ! open( $img, '-|', '/usr/bin/identify', '-verbose', $file ) ) {
        ## One unreadable image must not abort the whole run (the page
        ## footer would never be written), so warn and carry on.
        warn( "Can't run identify on '$file': $!\n" );
        next;
    }
    while ( my $line = <$img> ) {
        if ( $size == 0 && $line =~ m/^\s+geometry:\s+(\d+)x\d+/i ) {
            $size = $1;
        }
    }
    close( $img );

    ## Shrink anything wider than 640 pixels, in place.
    if ( $size > 640 ) {
        system( '/usr/bin/convert', '-resize', '640', $file, $file ) == 0
            or warn( "Could not resize '$file' (status $?)\n" );
    }
}

## &writeMainIndex ( \%dates );

&writeFooter( \%dates );

$DATE=`date`;  chomp( $DATE );
print STDOUT "Completed comic fetch at $DATE\n";
                    106: 
                    107: ## End
                    108: 
                    109: #######################################################################
                    110: ## Function :  downloadComic
                    111: ##
                    112: ##   Description :
                    113: ##     This function determines the download method being used to 
                    114: ##      retrieve the comic and calls the apprioriate function.
                    115: ##
                    116: ##      If the mode is invalid an error will be returned.
                    117: ##
                    118: #######################################################################
                    119: sub downloadComic ($$) {
                    120:        my ( $comics, $comic, $date ) = @_;
                    121: 
                    122:        SWITCH: {
                    123:                if ( $comics->{$comic}{'mode'} eq 1 ) { 
                    124:                        return indexDownload ( \%comics, $comic, $date );
                    125:                        last SWITCH;
                    126:                }
                    127:                if ( $comics->{$comic}{'mode'} eq 2 ) { 
                    128:                        return directDownload ( \%comics, $comic, $date );
                    129:                        last SWITCH;
                    130:                }
                    131:        }
                    132:         
                    133:        return "ERROR: Unknown download method specified for $comics->{$comic}{'fullName'}.";
                    134: }
                    135: 
                    136: #######################################################################
                    137: #######################################################################
                    138: sub readComicConfig ($$) {
                    139:        my ( $comicFile ) = @_;
                    140:        my %comicConfig   = ( );
                    141:        my %config        = ( );
                    142: 
1.14      nick      143:     my ($year, $mon, $day) =( localtime(time))[5,4,3];
                    144:     $year += 1900;
                    145:     $mon = sprintf("%02d", ($mon + 1));
                    146:     $day = sprintf("%02d", $day);
                    147: 
1.1       nick      148:        open FILEN, "<$comicFile";
                    149:                while (<FILEN>) {
1.22    ! nick      150:             if ($_ =~ m/^#.* \$Id: (.*)\$/) {
        !           151:                 $comicConfigVer = $1;
        !           152:             }
1.1       nick      153:                        if ( ( $_ !~ m/^#/ ) && ( $_ =~ m/,.*,/) ){
1.14      nick      154:                 $_ =~ s/__YEAR__/$year/g;
                    155:                 $_ =~ s/__MON__/$mon/g;
                    156:                 $_ =~ s/__DAY__/$day/g;
                    157:                 
1.1       nick      158:                                my @res = split /,/, $_;
                    159:                                $comicConfig{$res[0]}{'url'}      = $res[1];
                    160:                                $comicConfig{$res[0]}{'search'}   = $res[2];
                    161:                                $comicConfig{$res[0]}{'mode'}     = $res[3];
                    162:                                $comicConfig{$res[0]}{'fullName'} = $res[4];
                    163:                                $comicConfig{$res[0]}{'ext'}      = $res[5];
1.21      nick      164:                 $comicConfig{$res[0]}{'sunday'}   = sprintf("%d", $res[6] || 1);
                    165:                 $comicConfig{$res[0]}{'sunday_only'} = sprintf("%d", $res[7] || 0);
1.1       nick      166:                                $comicConfig{$res[0]}{'error'}    = 0;
                    167:                        }
                    168:                        elsif ( $_ =~ m/(.*)\s+=\s+(.*)/ ) {
                    169:                                $comicConfig{'configs'}{$1} = $2;
                    170:                        }
                    171:                }
                    172:        close (FILEN);
                    173: 
                    174:        return %comicConfig;
                    175: }
                    176: 
                    177: #######################################################################
                    178: #######################################################################
                    179: sub writeComic ($$) {
                    180:        my ( $comics, $comic, $date ) = @_;
1.11      nick      181:        my $sd = substr( join( '', $days[$date->{'dow'}] ), 0, 3 );
1.12      nick      182:        my $indexFile = $indexDir . "/index-" . $date->{'year2'} . 
                    183:                        $date->{'mon2'} . $date->{'day2'} . "-" . 
                    184:                        $sd . ".html";
1.1       nick      185:        my $content = <<EOF;
                    186: 
                    187: <!-- ********* Begin $comic ($comics->{$comic}{'fullName'}) ******* -->
                    188:   <tr>
                    189:     <td align="left">
                    190: <font color="blue"><b>$comics->{$comic}{'fullName'}</b></font> &nbsp; &nbsp; 
                    191: <font size="-2">
                    192:        <a href="$comics->{$comic}{'url'}">
                    193:                $comics->{$comic}{'url'}
                    194:        </a>
                    195: </font><br/>
1.17      nick      196: <img src="../images/$date->{'mon2'}$date->{'year2'}/$comic-$date->{'day2'}.$comics->{$comic}{'ext'}" alt="$comic-$date->{'day2'}" />
1.1       nick      197: <br/><br/>
                    198: </td></tr>
                    199: <!-- ********* Finish $comic ($comics->{$comic}{'fullName'}) ******* -->
                    200: 
                    201: EOF
                    202:        open INDEX, ">>$indexFile";
                    203: 
                    204:        print INDEX $content if ( ! $comics->{$comic}{'error'} );
                    205: 
                    206:        print INDEX <<EOF
                    207: <font color="blue"><b>$comics->{$comic}{'fullName'}</b></font> &nbsp; &nbsp;
                    208: <font size="-2"><
                    209:         <a href="$comics->{$comic}{'url'}">
                    210:                 $comics->{$comic}{'url'}
                    211:         </a>
                    212: </font><br/>
                    213: <font color="red"><b>$comic :  $comics->{$comic}{'error'}</b></font><br/>
                    214:   </td>
                    215: </tr>
                    216: EOF
                    217:                if ( $comics->{$comic}{'error'} );
                    218: 
                    219:        close (INDEX);
                    220: 
                    221:        return 0;
                    222: }
                    223: 
                    224: 
                    225: #######################################################################
                    226: #######################################################################
                    227: sub writeMainIndex ($$) {
                    228:        my ( $date ) = @_;
                    229: 
                    230: }
                    231: 
                    232: 
                    233: #######################################################################
                    234: #######################################################################
                    235: sub writeFooter {
                    236:        my ( $date ) = @_;
1.11      nick      237:        my $sd = substr( join( '', $days[$date->{'dow'}] ), 0, 3 );
1.12      nick      238:        my $indexFile = $indexDir . "/index-" . $date->{'year2'} . 
                    239:                        $date->{'mon2'} . $date->{'day2'} . "-" . 
                    240:                        $sd . ".html";
1.1       nick      241:        my $sysDate = `date`;
                    242: 
                    243:        open INDEX, ">>$indexFile";
                    244:        print INDEX <<EOF;
                    245: </table>
1.3       nick      246: <center>
                    247: <font size="2">
                    248: Generated on: <font color="green">$sysDate</font><br/>
1.7       nick      249: Version: <font color="green">$ver</font><br />
1.22    ! nick      250: Config Version: <font color="green">$comicConfigVer</font><br />
1.7       nick      251: CVS: <a href="http://demandred.dyndns.org/cgi-bin/cvsweb/comics/">http://demandred.dyndns.org/cgi-bin/cvsweb/comics/</a>
1.1       nick      252:   <p>
                    253:     <a href="http://validator.w3.org/check?uri=referer"><img
                    254:       src="http://www.w3.org/Icons/valid-xhtml10-blue" alt="Valid XHTML 1.0 Transitional" height="31" width="88" border="0" /></a>
                    255:   </p>
                    256: </center>
                    257: 
                    258: </body>
                    259: </html>
                    260: EOF
                    261:        close( INDEX );
                    262: }
                    263: 
                    264: #######################################################################
                    265: #######################################################################
                    266: sub checkDir ($$) {
                    267:        my @dir = @_;
                    268: 
                    269:        foreach ( @dir ) {
                    270:                if ( ! -d $_ ) { mkpath( $_ ); }
                    271:        }
                    272: }
                    273: 
                    274: #######################################################################
                    275: #######################################################################
                    276: sub writeTitle ($$) {
                    277:        my ( $date ) = @_;
1.11      nick      278:        my $sd = substr( join( '', $days[$date->{'dow'}] ), 0, 3 );
1.12      nick      279:        my $indexFile = $indexDir . "/index-" . $date->{'year2'} . 
                    280:                        $date->{'mon2'} . $date->{'day2'} . "-" . 
                    281:                        $sd . ".html";
1.8       nick      282:        my $today     = $days[$date->{'dow'}] . " " . $date->{'mon'} . "/" . $date->{'day'} . "/" . $date->{'year'};
1.16      nick      283:     my $today_long = Date_to_Text_Long(Today());
1.1       nick      284: 
                    285:        open INDEX, ">$indexFile";
                    286:        print INDEX <<EOF;
                    287: <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
                    288: 
                    289: <html xmlns="http://www.w3.org/1999/xhtml">
                    290: <head>
                    291: <meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
1.13      nick      292: <link href="/daily/comics/comics.css" type="text/css" rel="stylesheet" media="screen">
1.18      nick      293: <link rel="shortcut icon" href="./favicon.ico">
1.1       nick      294:     <title>Daily Comics for $today</title>
                    295:   </head>
                    296: <body bgcolor="#FFFFFF">
                    297: <table align="center" cellpadding="5" cellspacing="0">
1.16      nick      298: <tr><td>
                    299: <table cellpadding="0" cellspacing="0" border="0">
                    300: <tr><td align="Left"><img src="images/daily_comics_heading01.png"></td></tr>
                    301: <tr><td align="left">$today_long</td></tr>
                    302: <tr><td>&nbsp;</td></tr>
                    303: </td</tr>
                    304: 
1.1       nick      305: EOF
                    306:        close (INDEX);
                    307: }
                    308: 
                    309: #######################################################################
                    310: #######################################################################
                    311: sub directDownload ($$) {
                    312:        my ( $comics, $comic, $date ) = @_;
                    313:        my $file = &parseComic ( $comics, $comic, $date );
                    314: 
                    315:         ##
                    316:         ## Save the file to the appropriate directory
                    317:         ##
                    318:         my $cDir  = $date->{'mon2'} . $date->{'year2'};
                    319:         my $cDate = $date->{'day2'};
                    320: 
1.18      nick      321:        my $cmd = "wget -q $file --referer=\"" . $comics->{$comic}{'url'} ."\" --user-agent=\"$USER_AGENT\" -O - | /usr/bin/convert - jpeg:images/$cDir/$comic-$cDate.jpg";
1.14      nick      322: 
1.1       nick      323:         return system($cmd);
                    324: }
                    325: 
                    326: #######################################################################
                    327: #######################################################################
                    328: sub indexDownload ($$) {
                    329:        my ( $comics, $comic, $date ) = @_;
                    330:        my ( @lines, $comicLine, $mainURL );
                    331:        my $comicIndex = "indexes/index.$comic";
                    332: 
1.19      nick      333:     my $wget_cmd = "wget -q --referer=\"$comics->{$comic}{'url'}\" " .
                    334:                    "--user-agent=\"$USER_AGENT\" " .
                    335:                    "$comics->{$comic}{'url'} -O $comicIndex";
                    336:     system($wget_cmd);
1.1       nick      337: 
                    338:        if ( ! open FILEN, "<$comicIndex" ) {  
                    339:                return "ERROR: Can't open index file for " . $comics->{$comic}{'fullName'} . 
                    340:                       " (" . $comics->{$comic}{'url'} . ")"; 
                    341:        } 
                    342:                @lines = <FILEN>;
                    343:        close (FILEN);  
                    344: 
                    345:        unlink ("$comicIndex");
                    346: 
                    347:        $mainURL = $comics->{$comic}{'url'};
                    348:        ## I need to figure out how to merge these two in to one regex.
                    349:        $mainURL =~ s/(http:\/\/.*)(?:\/.*\/){1,}.*/$1/;
                    350:        $mainURL =~ s/([a-z])\/.*/$1/i;
                    351: 
                    352:        ##
                    353:        ## Find the comic strip URL based on the specified regex in the search
                    354:        ##
                    355:        foreach my $line (@lines) {
1.17      nick      356:                if ( $line =~ m/$comics->{$comic}{'search'}/i ) {
1.1       nick      357:                        $comicLine = $1; chomp $comicLine;
                    358:                }
1.17      nick      359:     }
1.1       nick      360: 
                    361:        ##
                    362:        ## Save the file to the appropriate directory
                    363:        ##
                    364:        my $cDir    = $date->{'mon2'} . $date->{'year2'};
                    365:        my $cDate   = $date->{'day2'};
                    366: 
                    367:        if ( $comicLine ) {
                    368:                if ( $comicLine =~ m/(gif|jpg|png)/i ) { $comics->{$comic}{'ext'} = $1; }
                    369:                my $comicURL = ( $comicLine =~ m/http/ ) ? $comicLine : $mainURL . $comicLine;
1.17      nick      370:                my $cmd = "wget --user-agent=\"$USER_AGENT\" --referer=\"" . $comics->{$comic}{'url'} . "\" -q $comicURL -O images/$cDir/$comic-$cDate.$comics->{$comic}{'ext'}";
1.1       nick      371:                system( $cmd );
                    372:                return 0;
                    373:        }
                    374: 
                    375:        unlink "index.html";
                    376: 
                    377:        return "ERROR: Could not download comic $comics->{$comic}{'fullName'}";
                    378: }
                    379: 
                    380: #######################################################################
                    381: #######################################################################
                    382: sub parseComic ($$) {
                    383:        my ( $comics, $comic, $date ) = @_;
                    384:        my $string = $comics->{$comic}{'search'};
                    385: 
                    386:        $string =~ s/__year__/$date->{'year'}/g;
                    387:        $string =~ s/__year2__/$date->{'year2'}/g;
                    388:        $string =~ s/__mon__/$date->{'mon'}/g;
                    389:        $string =~ s/__mon2__/$date->{'mon2'}/g;
                    390:        $string =~ s/__day__/$date->{'day'}/g;
                    391:        $string =~ s/__day2__/$date->{'day2'}/g;
                    392:        $string =~ s/__ext__/$comics->{$comic}{'ext'}/g;
                    393:        chomp $string;
                    394: 
                    395:        return $string;
                    396: }
                    397: 
                    398: #######################################################################
                    399: #######################################################################
                    400: sub fetchDates () {
                    401:        my %dates = ();
                    402: 
1.8       nick      403:        ($dates{'day'}, $dates{'mon'}, $dates{'year'}, $dates{'dow'}) = (localtime(time - (86400 * $days_ago )))[3,4,5,6];
1.1       nick      404: 
                    405:        $dates{'year'} += 1900;
                    406:        $dates{'year2'} = substr $dates{'year'}, 2, 2;
                    407:        $dates{'day2'}  = ( $dates{'day'} < 10 ) ? "0" . $dates{'day'} : $dates{'day'}; 
                    408:        $dates{'mon'}++;
                    409:        $dates{'mon2'}  = ( $dates{'mon'} < 10 ) ? "0".$dates{'mon'} : $dates{'mon'};
1.21      nick      410:     my @days = qw/ Sunday Monday Tuesday Wednesday Thursday Friday Saturday /;
                    411:     $dates{'wday'} = $days[$dates{'dow'}];
1.1       nick      412: 
                    413:        return %dates;
                    414: }
1.8       nick      415: 
                    416: ###############################################################################
                    417: ##
                    418: ## &fetchOptions( );
                    419: ##
                    420: ##      Grab our command line arguments and toss them in to a hash
                    421: ##
                    422: ###############################################################################
                    423: sub fetchOptions {
                    424:         my %opts;
                    425: 
                    426:         &GetOptions(
                    427:                         "days:i"        => \$opts{'days'},
                    428:                         "help|?"        => \$opts{'help'},
                    429:                         "man"           => \$opts{'man'},
                    430:                    ) || &pod2usage( );
                    431:         &pod2usage( ) if defined $opts{'help'};
                    432:         &pod2usage( { -verbose => 2, -input => \*DATA } ) if defined $opts{'man'};
                    433: 
                    434:         return %opts;
                    435: }
                    436: 
                    437: __END__
                    438: 
                    439: =head1 NAME
                    440: 
                    441: fetch.pl - Fetches comics and places them all locally in a single html file.
                    442: 
                    443: =head1 SYNOPSIS
                    444: 
                    445: fetch.pl [options]
                    446: 
                    447: Options:
                    448:         --days,d        Fetch comics from X days ago
                    449:         --help,?        Display the basic help menu
                    450:         --man,m         Display the detailed man page
                    451: 
                    452: =head1 DESCRIPTION
                    453: 
                    454: =head1 HISTORY
                    455: 
                    456: =head1 AUTHOR
                    457: 
                    458: Nicholas DeClario <nick@declario.com>
                    459: 
                    460: =head1 BUGS
                    461: 
                    462: This is a work in progress.  Please report all bugs to the author.
                    463: 
                    464: =head1 SEE ALSO
                    465: 
                    466: =head1 COPYRIGHT
                    467: 
                    468: =cut
                    469: 
                    470: 

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>