Annotation of comics/fetch.pl.new, revision 1.23

1.1       nick        1: #!/usr/bin/perl -w
                      2: 
1.15      nick        3: ###############################################################################
1.16      nick        4: # $Log: fetch.pl.new,v $
1.23    ! nick        5: # Revision 1.22  2017/12/05 13:37:40  nick
        !             6: # Added the CVS config version to the output.
        !             7: #
1.22      nick        8: # Revision 1.21  2015/10/26 14:25:40  nick
                      9: # Fixed a bug that was improperly including the day of week string, preventing the weekend comics from fetching properly.
                     10: #
1.21      nick       11: # Revision 1.20  2015/10/22 12:58:44  nick
                     12: # Added the ability for Sunday only comics.  Stonesoup is no longer weekdays, this has been added to Sunday only.  I also added Foxtrot Classics for weekdays and Foxtrot for Sundays.
                     13: #
1.20      nick       14: # Revision 1.19  2015/07/13 12:56:58  nick
                     15: # Added Sally Forth and Pearls Before Swine.  Adding Sally Forth required a change in the 'wget' command for fetching the index file to include 'user-agent' and 'referer'.
                     16: #
1.19      nick       17: # Revision 1.18  2015/05/07 12:31:43  nick
                     18: # Added favicon
                     19: #
1.18      nick       20: # Revision 1.17  2015/02/19 14:56:10  nick
                     21: # Fixed a problem that forced everything to JPG.  This would kill GIF animations, but would not display the gifs either because 'convert' appends an index number to the end of the file name for each frame of the GIF animation.  I fixed this to maintain GIF compatibility as well as rewriting how the script fetches the size of the file.  Additionally, I updated the configuration for Questionable Content to search for GIF or JPG, which is what triggered this entire update.
                     22: #
1.17      nick       23: # Revision 1.16  2015/02/05 18:05:58  nick
                     24: # Changed the background and added a fancy title.
                     25: #
1.16      nick       26: # Revision 1.15  2015/01/19 13:46:19  nick
                     27: # *** empty log message ***
                     28: #
1.15      nick       29: ###############################################################################
                     30: 
1.1       nick       31: use strict;
                     32: use File::Path;
                     33: use Data::Dumper;
1.8       nick       34: use Pod::Usage;
                     35: use Getopt::Long;
1.1       nick       36: 
1.21      nick       37: use Date::Calc qw/Date_to_Text_Long Today Day_of_Week Day_of_Week_to_Text/;
1.16      nick       38: 
1.1       nick       39: ## 
                     40: ## Some default values
                     41: ##
1.23    ! nick       42: my $ver                = '$Id: fetch.pl.new,v 1.22 2017/12/05 13:37:40 nick Exp $';
1.1       nick       43: my $comicFile   = "comics.conf";
                         ## Reassigned inside readComicConfig when the config file carries a CVS Id comment line.
1.22      nick       44: my $comicConfigVer = "Unknown";
                         ## One entry per comic short name, plus a 'configs' entry holding the "key = value" settings.
1.1       nick       45: my %comics     = &readComicConfig ( $comicFile );
1.8       nick       46: my %opts        = &fetchOptions( );
                         ## Number of days to look back (--days); 0 means today.
                     47: my $days_ago    = $opts{'days'} || 0;
                         ## fetchDates reads $days_ago, so it has to run after the assignment above.
1.1       nick       48: my %dates      = &fetchDates();
                     49: my $baseDir     = $comics{'configs'}{'base_directory'} || ".";
                         ## Images are grouped in one sub-directory per month, named <mon2><year2>.
                     50: my $imageDir    = $baseDir . "/" . ( $comics{'configs'}{'image_directory'} || "images" ) . 
                     51:                  "/$dates{'mon2'}$dates{'year2'}";
                     52: my $indexDir    = $baseDir . "/" . ( $comics{'configs'}{'index_directory'} || "indexes" );
                         ## Passed as the --user-agent value on the wget calls.
1.2       nick       53: my $USER_AGENT  = "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.18) Gecko/20110628 Ubuntu/10.10 (maverick) Firefox/3.6.18";
                         ## Day names indexed by localtime's day-of-week number (0 = Sunday).
1.8       nick       54: my @days        = qw/ Sunday Monday Tuesday Wednesday Thursday Friday Saturday /;
1.1       nick       55: 
                     56: my $DATE=`date`; chomp $DATE;
                     57: print STDOUT "Starting comic fetch at $DATE\n";
                     58: 
                     59: ##
                     60: ## Main program starts here
                     61: ##
                     62: &checkDir ( [ $imageDir, $indexDir ] );
                     63: 
                         ## Start the day's index page; writeTitle opens the file for overwrite.
1.5       nick       64: &writeTitle ( \%dates );
1.1       nick       65: 
                     66: foreach my $comic ( sort keys %comics ) {
1.20      nick       67: 
                           ## The 'configs' entry holds settings, not a comic; any key containing "config" is skipped.
                     68:   ## Skip if this is Sunday and the comic is weekdays only
1.1       nick       69:   next if ( $comic =~ m/config/ );
1.21      nick       70:   if (($dates{'wday'} eq "Sunday") && 
1.20      nick       71:       ($comics{$comic}{'sunday'} == 0)) {
                     72:     print "Skipping '$comic'; Weekdays only.\n";
                     73:     next;
                     74:   }
                     75: 
                     76:   ## Skip if Sunday only comic and it's not Sunday.
1.21      nick       77:   if (($dates{'wday'} ne "Sunday") &&
1.20      nick       78:       ($comics{$comic}{'sunday_only'} == 1)) {
1.21      nick       79:     print "Skipping '$comic' ($comics{$comic}{'sunday_only'}); Sunday only.\n";
1.20      nick       80:     next
                     81:   }
                     82: 
                           ## A true 'error' value makes writeComic emit the error row instead of the image row.
1.1       nick       83:   $comics{$comic}{'error'} = &downloadComic ( \%comics, $comic, \%dates );
                     84:   &writeComic ( \%comics, $comic, \%dates );
                     85: 
                             ## Read the image width from the first "geometry:" line that identify prints.
                             ## This runs whether or not the download reported an error.
1.17      nick       86:     my $file = "$imageDir/$comic-$dates{'day2'}.$comics{$comic}{'ext'}";
                     87:     my $size = 0;    
                     88: 
                     89:     my $cmd = "/usr/bin/identify -verbose $file|";
                     90:     open(IMG, $cmd) || die ("Can't open: $!\n");
                     91:     while(<IMG>) {
                     92:         if ($_ =~ m/^\s+geometry:\s+(\d+)x\d+.*/i) {
                     93:             $size = $1 if ( $size == 0);
                     94:         }
                     95:     }
                     96:     close(IMG);
1.4       nick       97: 
1.19      nick       98: 
                                ## Images wider than 640 pixels are resized in place to a width of 640.
1.4       nick       99:        system( "/usr/bin/convert -resize 640 $file $file" )
                    100:                if ( $size > 640 ) 
                    101: }
                    102: 
1.1       nick      103: ## &writeMainIndex ( \%dates );
                    104: 
                         ## Close the table and the page opened by writeTitle.
                    105: &writeFooter( \%dates );
                    106: 
                    107: $DATE=`date`;  chomp( $DATE );
                    108: print STDOUT "Completed comic fetch at $DATE\n";
                    109: 
                    110: ## End
                    111: 
                    112: #######################################################################
                    113: ## Function :  downloadComic
                    114: ##
                    115: ##   Description :
                    116: ##     This function determines the download method being used to 
                    117: ##      retrieve the comic and calls the appropriate function.
                    118: ##
                    119: ##      If the mode is invalid an error will be returned.
                    120: ##
                    121: #######################################################################
                    122: sub downloadComic ($$) {
                                ## Dispatch on the comic's 'mode' setting:
                                ##   1 - indexDownload  (fetch the comic's page and search it)
                                ##   2 - directDownload (build the image URL from the search string)
                                ##
                                ##   $comics - hash ref holding every comic's settings
                                ##   $comic  - short name of the comic (key into $comics)
                                ##   $date   - hash ref as returned by fetchDates
                                ##
                                ## Returns whatever the selected downloader returns, or an
                                ## "ERROR: ..." string when the mode is neither 1 nor 2.
                                ## The downloaders receive the $comics reference that was passed
                                ## in, rather than the file-level %comics hash.
                    123:        my ( $comics, $comic, $date ) = @_;
                    124: 
                    125:        SWITCH: {
                    126:                if ( $comics->{$comic}{'mode'} eq 1 ) { 
                    127:                        return indexDownload ( $comics, $comic, $date );
                    128:                        last SWITCH;
                    129:                }
                    130:                if ( $comics->{$comic}{'mode'} eq 2 ) { 
                    131:                        return directDownload ( $comics, $comic, $date );
                    132:                        last SWITCH;
                    133:                }
                    134:        }
                    135:         
                    136:        return "ERROR: Unknown download method specified for $comics->{$comic}{'fullName'}.";
                    137: }
                    138: 
                    139: #######################################################################
                    140: #######################################################################
                    141: sub readComicConfig ($$) {
                                ## Parse the comic configuration file named by $comicFile.
                                ##
                                ## Three kinds of line are recognised:
                                ##   - a comment line containing a CVS Id keyword: the text between
                                ##     "Id: " and "Exp" is stored in the file-level $comicConfigVer;
                                ##   - a non-comment line with at least two commas: a comic record,
                                ##     split on "," into name, url, search, mode, fullName, ext,
                                ##     sunday, sunday_only, remove_newlines;
                                ##   - otherwise a "key = value" line, stored under the 'configs' key.
                                ##
                                ## __YEAR__, __MON__ and __DAY__ in a comic record are replaced with
                                ## the current local date before the record is split.
                                ##
                                ## Returns the parsed settings as a hash (flattened list).
                    142:        my ( $comicFile ) = @_;
                    143:        my %comicConfig   = ( );
                    144:        my %config        = ( );
                    145: 
1.14      nick      146:     my ($year, $mon, $day) =( localtime(time))[5,4,3];
                    147:     $year += 1900;
                    148:     $mon = sprintf("%02d", ($mon + 1));
                    149:     $day = sprintf("%02d", $day);
                    150: 
1.1       nick      151:        open FILEN, "<$comicFile";
                    152:                while (<FILEN>) {
1.23    ! nick      153:             #if ($_ =~ m/^#.* \$Id: fetch.pl.new,v 1.22 2017/12/05 13:37:40 nick Exp $/) {
        !           154:             if ($_ =~ m/^#.* \$Id:\ (.*)Exp \$$/) {
1.22      nick      155:                 $comicConfigVer = $1;
                    156:             }
1.1       nick      157:                        if ( ( $_ !~ m/^#/ ) && ( $_ =~ m/,.*,/) ){
1.14      nick      158:                 $_ =~ s/__YEAR__/$year/g;
                    159:                 $_ =~ s/__MON__/$mon/g;
                    160:                 $_ =~ s/__DAY__/$day/g;
                    161:                 
1.1       nick      162:                                my @res = split /,/, $_;
                    163:                                $comicConfig{$res[0]}{'url'}      = $res[1];
                    164:                                $comicConfig{$res[0]}{'search'}   = $res[2];
                    165:                                $comicConfig{$res[0]}{'mode'}     = $res[3];
                    166:                                $comicConfig{$res[0]}{'fullName'} = $res[4];
                    167:                                $comicConfig{$res[0]}{'ext'}      = $res[5];
                                         ## 'sunday' defaults to 1 only when the field is missing or
                                         ## empty.  A plain "|| 1" would also replace an explicit "0",
                                         ## so a weekdays-only comic could never be configured.
1.21      nick      168:                 $comicConfig{$res[0]}{'sunday'}   = sprintf("%d", ( defined $res[6] && $res[6] ne '' ) ? $res[6] : 1);
                    169:                 $comicConfig{$res[0]}{'sunday_only'} = sprintf("%d", $res[7] || 0);
1.23    ! nick      170:                 $comicConfig{$res[0]}{'remove_newlines'} = sprintf("%d", $res[8] || 0);
1.1       nick      171:                                $comicConfig{$res[0]}{'error'}    = 0;
                    172:                        }
                    173:                        elsif ( $_ =~ m/(.*)\s+=\s+(.*)/ ) {
                    174:                                $comicConfig{'configs'}{$1} = $2;
                    175:                        }
                    176:                }
                    177:        close (FILEN);
                    178: 
                    179:        return %comicConfig;
                    180: }
                    181: 
                    182: #######################################################################
                    183: #######################################################################
                    184: sub writeComic ($$) {
                                ## Append one table row for $comic to the day's index page.
                                ##
                                ##   $comics - hash ref holding every comic's settings
                                ##   $comic  - short name of the comic (key into $comics)
                                ##   $date   - hash ref as returned by fetchDates
                                ##
                                ## When the comic's 'error' value is false the row shows the
                                ## downloaded image; otherwise it shows the error text in red.
                                ## The error row opens its own <tr>/<td> so that it is a
                                ## complete row, like the image row.
                                ## Always returns 0.
                    185:        my ( $comics, $comic, $date ) = @_;
1.11      nick      186:        my $sd = substr( join( '', $days[$date->{'dow'}] ), 0, 3 );
1.12      nick      187:        my $indexFile = $indexDir . "/index-" . $date->{'year2'} . 
                    188:                        $date->{'mon2'} . $date->{'day2'} . "-" . 
                    189:                        $sd . ".html";
1.1       nick      190:        my $content = <<EOF;
                    191: 
                    192: <!-- ********* Begin $comic ($comics->{$comic}{'fullName'}) ******* -->
                    193:   <tr>
                    194:     <td align="left">
                    195: <font color="blue"><b>$comics->{$comic}{'fullName'}</b></font> &nbsp; &nbsp; 
                    196: <font size="-2">
                    197:        <a href="$comics->{$comic}{'url'}">
                    198:                $comics->{$comic}{'url'}
                    199:        </a>
                    200: </font><br/>
1.17      nick      201: <img src="../images/$date->{'mon2'}$date->{'year2'}/$comic-$date->{'day2'}.$comics->{$comic}{'ext'}" alt="$comic-$date->{'day2'}" />
1.1       nick      202: <br/><br/>
                    203: </td></tr>
                    204: <!-- ********* Finish $comic ($comics->{$comic}{'fullName'}) ******* -->
                    205: 
                    206: EOF
                    207:        open INDEX, ">>$indexFile";
                    208: 
                    209:        print INDEX $content if ( ! $comics->{$comic}{'error'} );
                    210: 
                    211:        print INDEX <<EOF
                           <tr>
                             <td align="left">
                    212: <font color="blue"><b>$comics->{$comic}{'fullName'}</b></font> &nbsp; &nbsp;
                    213: <font size="-2">
                    214:         <a href="$comics->{$comic}{'url'}">
                    215:                 $comics->{$comic}{'url'}
                    216:         </a>
                    217: </font><br/>
                    218: <font color="red"><b>$comic :  $comics->{$comic}{'error'}</b></font><br/>
                    219:   </td>
                    220: </tr>
                    221: EOF
                    222:                if ( $comics->{$comic}{'error'} );
                    223: 
                    224:        close (INDEX);
                    225: 
                    226:        return 0;
                    227: }
                    228: 
                    229: 
                    230: #######################################################################
                    231: #######################################################################
                    232: sub writeMainIndex ($$) {
                                ## Placeholder: unpacks its $date argument and does nothing else.
                                ## The only call to it in this file is commented out.
                    233:        my ( $date ) = @_;
                    234: 
                    235: }
                    236: 
                    237: 
                    238: #######################################################################
                    239: #######################################################################
                    240: sub writeFooter {
                                ## Append the closing markup to the day's index page: the end of
                                ## the table, the generation time, the script and config version
                                ## strings, the CVS link, the validator badge and the closing
                                ## body/html tags.
                                ##
                                ##   $date - hash ref as returned by fetchDates (used for the file name)
                    241:        my ( $date ) = @_;
                                ## Three-letter day abbreviation used in the index file name.
1.11      nick      242:        my $sd = substr( join( '', $days[$date->{'dow'}] ), 0, 3 );
1.12      nick      243:        my $indexFile = $indexDir . "/index-" . $date->{'year2'} . 
                    244:                        $date->{'mon2'} . $date->{'day2'} . "-" . 
                    245:                        $sd . ".html";
                                ## Output of the system 'date' command; its trailing newline is kept.
1.1       nick      246:        my $sysDate = `date`;
                    247: 
                    248:        open INDEX, ">>$indexFile";
                    249:        print INDEX <<EOF;
                    250: </table>
1.3       nick      251: <center>
                    252: <font size="2">
                    253: Generated on: <font color="green">$sysDate</font><br/>
1.7       nick      254: Version: <font color="green">$ver</font><br />
1.22      nick      255: Config Version: <font color="green">$comicConfigVer</font><br />
1.7       nick      256: CVS: <a href="http://demandred.dyndns.org/cgi-bin/cvsweb/comics/">http://demandred.dyndns.org/cgi-bin/cvsweb/comics/</a>
1.1       nick      257:   <p>
                    258:     <a href="http://validator.w3.org/check?uri=referer"><img
                    259:       src="http://www.w3.org/Icons/valid-xhtml10-blue" alt="Valid XHTML 1.0 Transitional" height="31" width="88" border="0" /></a>
                    260:   </p>
                    261: </center>
                    262: 
                    263: </body>
                    264: </html>
                    265: EOF
                    266:        close( INDEX );
                    267: }
                    268: 
                    269: #######################################################################
                    270: #######################################################################
                    271: sub checkDir ($$) {
                                ## Create every directory named in the argument list that does
                                ## not exist yet.  Arguments may be plain path strings or array
                                ## references of path strings (the main program passes a single
                                ## array reference).
                                ##
                                ## Array references are flattened first, so the -d test sees each
                                ## path rather than the stringified reference.
                    272:        my @dir = map { ref( $_ ) eq 'ARRAY' ? @{ $_ } : $_ } @_;
                    273: 
                    274:        foreach ( @dir ) {
                    275:                if ( ! -d $_ ) { mkpath( $_ ); }
                    276:        }
                    277: }
                    278: 
                    279: #######################################################################
                    280: #######################################################################
                    281: sub writeTitle ($$) {
                                ## Create (overwriting any existing file) the day's index page
                                ## and write the document head, the opening of the main table and
                                ## the heading block.
                                ##
                                ##   $date - hash ref as returned by fetchDates
                                ##
                                ## The heading sits in a nested table, which is closed here
                                ## together with its enclosing cell and row.  The comic rows
                                ## appended later by writeComic therefore land in the main table,
                                ## which writeFooter closes.
                    282:        my ( $date ) = @_;
1.11      nick      283:        my $sd = substr( join( '', $days[$date->{'dow'}] ), 0, 3 );
1.12      nick      284:        my $indexFile = $indexDir . "/index-" . $date->{'year2'} . 
                    285:                        $date->{'mon2'} . $date->{'day2'} . "-" . 
                    286:                        $sd . ".html";
1.8       nick      287:        my $today     = $days[$date->{'dow'}] . " " . $date->{'mon'} . "/" . $date->{'day'} . "/" . $date->{'year'};
                                ## Long-form text of the current date (Today()), independent of $date.
1.16      nick      288:     my $today_long = Date_to_Text_Long(Today());
1.1       nick      289: 
                    290:        open INDEX, ">$indexFile";
                    291:        print INDEX <<EOF;
                    292: <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
                    293: 
                    294: <html xmlns="http://www.w3.org/1999/xhtml">
                    295: <head>
                    296: <meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
1.13      nick      297: <link href="/daily/comics/comics.css" type="text/css" rel="stylesheet" media="screen">
1.18      nick      298: <link rel="shortcut icon" href="./favicon.ico">
1.1       nick      299:     <title>Daily Comics for $today</title>
                    300:   </head>
                    301: <body bgcolor="#FFFFFF">
                    302: <table align="center" cellpadding="5" cellspacing="0">
1.16      nick      303: <tr><td>
                    304: <table cellpadding="0" cellspacing="0" border="0">
                    305: <tr><td align="Left"><img src="images/daily_comics_heading01.png"></td></tr>
                    306: <tr><td align="left">$today_long</td></tr>
                    307: <tr><td>&nbsp;</td></tr>
                    308: </table></td></tr>
                    309: 
1.1       nick      310: EOF
                    311:        close (INDEX);
                    312: }
                    313: 
                    314: #######################################################################
                    315: #######################################################################
                    316: sub directDownload ($$) {
                                ## Download a comic whose image URL can be built directly from
                                ## its 'search' string (placeholders expanded by parseComic).
                                ## The image is fetched with wget and piped through convert,
                                ## which writes it as a JPEG to images/<mon2><year2>/<comic>-<day2>.jpg.
                                ##
                                ##   $comics - hash ref holding every comic's settings
                                ##   $comic  - short name of the comic (key into $comics)
                                ##   $date   - hash ref as returned by fetchDates
                                ##
                                ## Returns the value of system() for the pipeline (0 on success).
                    317:        my ( $comics, $comic, $date ) = @_;
                    318:        my $file = &parseComic ( $comics, $comic, $date );
                    319: 
                    320:         ##
                    321:         ## Save the file to the appropriate directory
                    322:         ##
                    323:         my $cDir  = $date->{'mon2'} . $date->{'year2'};
                    324:         my $cDate = $date->{'day2'};
                    325: 
1.18      nick      326:        my $cmd = "wget -q $file --referer=\"" . $comics->{$comic}{'url'} ."\" --user-agent=\"$USER_AGENT\" -O - | /usr/bin/convert - jpeg:images/$cDir/$comic-$cDate.jpg";
1.14      nick      327: 
                                 ## The output file name always ends in ".jpg".  Record that
                                 ## extension so the resize step and the <img> link, which both
                                 ## build the name from 'ext', refer to the file written here.
                                 ## This is done after parseComic, which uses the configured
                                 ## 'ext' for the source URL.
                                 $comics->{$comic}{'ext'} = "jpg";

1.1       nick      328:         return system($cmd);
                    329: }
                    330: 
                    331: #######################################################################
                    332: #######################################################################
                    333: sub indexDownload ($$) {
                                ## Download a comic by first fetching its web page, searching
                                ## the page for the image URL and then fetching that image.
                                ##
                                ##   $comics - hash ref holding every comic's settings
                                ##   $comic  - short name of the comic (key into $comics)
                                ##   $date   - hash ref as returned by fetchDates
                                ##
                                ## The page is saved to indexes/index.<comic>, read and deleted.
                                ## Each line is matched (case-insensitively) against the comic's
                                ## 'search' regex; the first capture group of the last matching
                                ## line is taken as the image URL.  A gif/jpg/png found in it
                                ## replaces the comic's 'ext'.  A URL without "http" is appended
                                ## to a base derived from the comic's 'url'.
                                ##
                                ## Returns 0 once an image URL was found and wget was run (the
                                ## wget exit status is not checked), or an "ERROR: ..." string.
                    334:        my ( $comics, $comic, $date ) = @_;
                    335:        my ( @lines, $comicLine, $mainURL );
                    336:        my $comicIndex = "indexes/index.$comic";
                    337: 
1.19      nick      338:     my $wget_cmd = "wget -q --referer=\"$comics->{$comic}{'url'}\" " .
                    339:                    "--user-agent=\"$USER_AGENT\" " .
                    340:                    "$comics->{$comic}{'url'} -O $comicIndex";
                    341:     system($wget_cmd);
1.1       nick      342: 
                    343:        if ( ! open FILEN, "<$comicIndex" ) {  
                    344:                return "ERROR: Can't open index file for " . $comics->{$comic}{'fullName'} . 
                    345:                       " (" . $comics->{$comic}{'url'} . ")"; 
                    346:        } 
1.23    ! nick      347:     while (<FILEN>) {
        !           348:         my $line = $_;
                                 ## The key must be spelled 'remove_newlines', as set by
                                 ## readComicConfig, or the option is never honoured.
                                 ## NOTE(review): lines are still searched one at a time below;
                                 ## confirm whether they were meant to be joined first.
        !           349:         $line =~ s/\R|\ \ +|\t//g if ( $comics->{$comic}{'remove_newlines'} );
        !           350:        push @lines, $line;
        !           351:     }
1.1       nick      352:        close (FILEN);  
                    353: 
                    354:        unlink ("$comicIndex");
                    355: 
                    356:        $mainURL = $comics->{$comic}{'url'};
                    357:        ## I need to figure out how to merge these two in to one regex.
                    358:        $mainURL =~ s/(http:\/\/.*)(?:\/.*\/){1,}.*/$1/;
                    359:        $mainURL =~ s/([a-z])\/.*/$1/i;
                    360: 
                    361:        ##
                    362:        ## Find the comic strip URL based on the specified regex in the search
                    363:        ##
                    364:        foreach my $line (@lines) {
1.17      nick      365:                if ( $line =~ m/$comics->{$comic}{'search'}/i ) {
1.1       nick      366:                        $comicLine = $1; chomp $comicLine;
                    367:                }
1.17      nick      368:     }
1.1       nick      369: 
                    370:        ##
                    371:        ## Save the file to the appropriate directory
                    372:        ##
                    373:        my $cDir    = $date->{'mon2'} . $date->{'year2'};
                    374:        my $cDate   = $date->{'day2'};
                    375: 
                    376:        if ( $comicLine ) {
                    377:                if ( $comicLine =~ m/(gif|jpg|png)/i ) { $comics->{$comic}{'ext'} = $1; }
                    378:                my $comicURL = ( $comicLine =~ m/http/ ) ? $comicLine : $mainURL . $comicLine;
                                        ## The URL comes from a downloaded page, so wget is run with
                                        ## the list form of system(); nothing in the URL is then
                                        ## interpreted by a shell.  Surrounding whitespace, which a
                                        ## shell would have dropped, is trimmed explicitly.
                                        $comicURL =~ s/^\s+|\s+$//g;
1.17      nick      379:                my @cmd = ( "wget", "--user-agent=$USER_AGENT", "--referer=" . $comics->{$comic}{'url'}, "-q", $comicURL, "-O", "images/$cDir/$comic-$cDate.$comics->{$comic}{'ext'}" );
1.1       nick      380:                system( @cmd );
                    381:                return 0;
                    382:        }
                    383: 
                    384:        unlink "index.html";
                    385: 
                    386:        return "ERROR: Could not download comic $comics->{$comic}{'fullName'}";
                    387: }
                    388: 
                    389: #######################################################################
                    390: #######################################################################
                    391: sub parseComic ($$) {
                                ## Expand the date and extension placeholders in a comic's
                                ## 'search' string and return the result with any trailing
                                ## newline removed.  directDownload uses the result as the
                                ## image URL.
                                ##
                                ##   $comics - hash ref holding every comic's settings
                                ##   $comic  - short name of the comic (key into $comics)
                                ##   $date   - hash ref as returned by fetchDates
                                ##
                                ## Placeholders (lower case): __year__, __year2__, __mon__,
                                ## __mon2__, __day__, __day2__ take the matching $date entry;
                                ## __ext__ takes the comic's 'ext' setting.
                    392:        my ( $comics, $comic, $date ) = @_;
                    393:        my $string = $comics->{$comic}{'search'};
                    394: 
                    395:        $string =~ s/__year__/$date->{'year'}/g;
                    396:        $string =~ s/__year2__/$date->{'year2'}/g;
                    397:        $string =~ s/__mon__/$date->{'mon'}/g;
                    398:        $string =~ s/__mon2__/$date->{'mon2'}/g;
                    399:        $string =~ s/__day__/$date->{'day'}/g;
                    400:        $string =~ s/__day2__/$date->{'day2'}/g;
                    401:        $string =~ s/__ext__/$comics->{$comic}{'ext'}/g;
                    402:        chomp $string;
                    403: 
                    404:        return $string;
                    405: }
                    406: 
                    407: #######################################################################
                    408: #######################################################################
                    409: sub fetchDates () {
                                ## Build the date fields for the day being fetched: local time
                                ## now, minus 86400 seconds for each of the file-level $days_ago.
                                ##
                                ## Returns a hash (flattened list) with these keys:
                                ##   day, mon, year  - day of month, month (1-12), four-digit year
                                ##   day2, mon2      - day and month zero-padded to two digits
                                ##   year2           - last two digits of the year
                                ##   dow             - day of week number (0 = Sunday)
                                ##   wday            - full English day name
                    410:        my %dates = ();
                    411: 
1.8       nick      412:        ($dates{'day'}, $dates{'mon'}, $dates{'year'}, $dates{'dow'}) = (localtime(time - (86400 * $days_ago )))[3,4,5,6];
1.1       nick      413: 
                                ## localtime counts years from 1900 and months from 0.
                    414:        $dates{'year'} += 1900;
                    415:        $dates{'year2'} = substr $dates{'year'}, 2, 2;
                    416:        $dates{'day2'}  = ( $dates{'day'} < 10 ) ? "0" . $dates{'day'} : $dates{'day'}; 
                    417:        $dates{'mon'}++;
                    418:        $dates{'mon2'}  = ( $dates{'mon'} < 10 ) ? "0".$dates{'mon'} : $dates{'mon'};
                             ## Local list of day names; it shadows the file-level @days inside this sub.
1.21      nick      419:     my @days = qw/ Sunday Monday Tuesday Wednesday Thursday Friday Saturday /;
                    420:     $dates{'wday'} = $days[$dates{'dow'}];
1.1       nick      421: 
                    422:        return %dates;
                    423: }
1.8       nick      424: 
                    425: ###############################################################################
                    426: ##
                    427: ## &fetchOptions( );
                    428: ##
                    429: ##      Grab our command line arguments and toss them in to a hash
                    430: ##
                    431: ###############################################################################
                    432: sub fetchOptions {
                                 ## Parse the command line with GetOptions and return the
                                 ## options as a hash (flattened list) with the keys 'days',
                                 ## 'help' and 'man'.  pod2usage is called when parsing fails or
                                 ## when --help is given; --man calls it with -verbose => 2,
                                 ## reading the POD from the DATA handle.
                    433:         my %opts;
                    434: 
                    435:         &GetOptions(
                                                 ## "days:i" - the integer value is optional
                    436:                         "days:i"        => \$opts{'days'},
                    437:                         "help|?"        => \$opts{'help'},
                    438:                         "man"           => \$opts{'man'},
                    439:                    ) || &pod2usage( );
                    440:         &pod2usage( ) if defined $opts{'help'};
                    441:         &pod2usage( { -verbose => 2, -input => \*DATA } ) if defined $opts{'man'};
                    442: 
                    443:         return %opts;
                    444: }
                    445: 
                    446: __END__
                    447: 
                    448: =head1 NAME
                    449: 
                    450: fetch.pl - Fetches comics and places them all locally in a single html file.
                    451: 
                    452: =head1 SYNOPSIS
                    453: 
                    454: fetch.pl [options]
                    455: 
                    456: Options:
                    457:         --days,d        Fetch comics from X days ago
                    458:         --help,?        Display the basic help menu
                    459:         --man,m         Display the detailed man page
                    460: 
                    461: =head1 DESCRIPTION
                    462: 
                    463: =head1 HISTORY
                    464: 
                    465: =head1 AUTHOR
                    466: 
                    467: Nicholas DeClario <nick@declario.com>
                    468: 
                    469: =head1 BUGS
                    470: 
                    471: This is a work in progress.  Please report all bugs to the author.
                    472: 
                    473: =head1 SEE ALSO
                    474: 
                    475: =head1 COPYRIGHT
                    476: 
                    477: =cut
                    478: 
                    479: 

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>