Annotation of comics/fetch.pl.new, revision 1.20

1.1       nick        1: #!/usr/bin/perl -w
                      2: 
1.15      nick        3: ###############################################################################
1.16      nick        4: # $Log: fetch.pl.new,v $
1.20    ! nick        5: # Revision 1.19  2015/07/13 12:56:58  nick
        !             6: # Added Sally Forth and Pearls Before Swine.  Adding Sally Forth required a change in the 'wget' command for fetching the index file to include 'user-agent' and 'referer'.
        !             7: #
1.19      nick        8: # Revision 1.18  2015/05/07 12:31:43  nick
                      9: # Added favicon
                     10: #
1.18      nick       11: # Revision 1.17  2015/02/19 14:56:10  nick
                     12: # Fixed a problem that forced everything to JPG.  This would kill GIF animations, and would not display the GIFs either, because 'convert' appends an index number to the end of the file name for each frame of the GIF animation.  I fixed this to maintain GIF compatibility and also rewrote how the script fetches the size of the file.  Additionally, I updated the configuration for Questionable Content to search for GIF or JPG, which is what triggered this entire update.
                     13: #
1.17      nick       14: # Revision 1.16  2015/02/05 18:05:58  nick
                     15: # Changed the background and added a fancy title.
                     16: #
1.16      nick       17: # Revision 1.15  2015/01/19 13:46:19  nick
                     18: # *** empty log message ***
                     19: #
1.15      nick       20: ###############################################################################
                     21: 
1.1       nick       22: use strict;
                     23: use File::Path;
                     24: use Data::Dumper;
1.8       nick       25: use Pod::Usage;
                     26: use Getopt::Long;
1.1       nick       27: 
1.16      nick       28: use Date::Calc qw/Date_to_Text_Long Today/;
                     29: 
##
## Script-wide settings.
##
## Every variable declared here is read by the main loop and/or by the
## subroutines further down, so the names must stay exactly as they are.
##
my $ver         = '$Id: fetch.pl.new,v 1.19 2015/07/13 12:56:58 nick Exp $';
my $USER_AGENT  = "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.18) Gecko/20110628 Ubuntu/10.10 (maverick) Firefox/3.6.18";
my @days        = qw/ Sunday Monday Tuesday Wednesday Thursday Friday Saturday /;

## Comic definitions and the "configs" settings come from the config file.
my $comicFile   = "comics.conf";
my %comics      = &readComicConfig( $comicFile );

## $days_ago has to be set before fetchDates() runs, because fetchDates()
## reads it to decide which day to work on.
my %opts        = &fetchOptions();
my $days_ago    = $opts{'days'} ? $opts{'days'} : 0;
my %dates       = &fetchDates();

## Directory layout: <base>/<images>/<MMYY> and <base>/<indexes>.
my $baseDir     = $comics{'configs'}{'base_directory'} || ".";
my $imageDir    = join( "/",
                        $baseDir,
                        ( $comics{'configs'}{'image_directory'} || "images" ),
                        $dates{'mon2'} . $dates{'year2'} );
my $indexDir    = join( "/",
                        $baseDir,
                        ( $comics{'configs'}{'index_directory'} || "indexes" ) );

chomp( my $DATE = `date` );
print STDOUT "Starting comic fetch at $DATE\n";
                     48: 
                     49: ##
                     50: ## Main program starts here
                     51: ##
                     52: &checkDir ( [ $imageDir, $indexDir ] );
                     53: 
1.5       nick       54: &writeTitle ( \%dates );
1.1       nick       55: 
                     56: foreach my $comic ( sort keys %comics ) {
1.20    ! nick       57: 
        !            58:   ## Skip if this is Sunday and the comic is weekdays only
1.1       nick       59:   next if ( $comic =~ m/config/ );
1.20    ! nick       60:   if (($dates{'day2'} eq "Sunday") && 
        !            61:       ($comics{$comic}{'sunday'} == 0)) {
        !            62:     print "Skipping '$comic'; Weekdays only.\n";
        !            63:     next;
        !            64:   }
        !            65: 
        !            66:   ## Skip if Sunday only comic and it's not Sunday.
        !            67:   if (($dates{'day2'} ne "Sunday") &&
        !            68:       ($comics{$comic}{'sunday_only'} == 1)) {
        !            69:     print "Skipping '$comic'; Sunday only.\n";
        !            70:     next
        !            71:   }
        !            72: 
1.1       nick       73:   $comics{$comic}{'error'} = &downloadComic ( \%comics, $comic, \%dates );
                     74:   &writeComic ( \%comics, $comic, \%dates );
                     75: 
1.17      nick       76:     my $file = "$imageDir/$comic-$dates{'day2'}.$comics{$comic}{'ext'}";
                     77:     my $size = 0;    
                     78: 
                     79:     my $cmd = "/usr/bin/identify -verbose $file|";
                     80:     open(IMG, $cmd) || die ("Can't open: $!\n");
                     81:     while(<IMG>) {
                     82:         if ($_ =~ m/^\s+geometry:\s+(\d+)x\d+.*/i) {
                     83:             $size = $1 if ( $size == 0);
                     84:         }
                     85:     }
                     86:     close(IMG);
1.4       nick       87: 
1.19      nick       88: 
1.4       nick       89:        system( "/usr/bin/convert -resize 640 $file $file" )
                     90:                if ( $size > 640 ) 
                     91: }
                     92: 
1.1       nick       93: ## &writeMainIndex ( \%dates );
                     94: 
                     95: &writeFooter( \%dates );
                     96: 
                     97: $DATE=`date`;  chomp( $DATE );
                     98: print STDOUT "Completed comic fetch at $DATE\n";
                     99: 
                    100: ## End
                    101: 
                    102: #######################################################################
                    103: ## Function :  downloadComic
                    104: ##
                    105: ##   Description :
                    106: ##     This function determines the download method being used to 
                    107: ##      retrieve the comic and calls the apprioriate function.
                    108: ##
                    109: ##      If the mode is invalid an error will be returned.
                    110: ##
                    111: #######################################################################
                    112: sub downloadComic ($$) {
                    113:        my ( $comics, $comic, $date ) = @_;
                    114: 
                    115:        SWITCH: {
                    116:                if ( $comics->{$comic}{'mode'} eq 1 ) { 
                    117:                        return indexDownload ( \%comics, $comic, $date );
                    118:                        last SWITCH;
                    119:                }
                    120:                if ( $comics->{$comic}{'mode'} eq 2 ) { 
                    121:                        return directDownload ( \%comics, $comic, $date );
                    122:                        last SWITCH;
                    123:                }
                    124:        }
                    125:         
                    126:        return "ERROR: Unknown download method specified for $comics->{$comic}{'fullName'}.";
                    127: }
                    128: 
                    129: #######################################################################
                    130: #######################################################################
                    131: sub readComicConfig ($$) {
                    132:        my ( $comicFile ) = @_;
                    133:        my %comicConfig   = ( );
                    134:        my %config        = ( );
                    135: 
1.14      nick      136:     my ($year, $mon, $day) =( localtime(time))[5,4,3];
                    137:     $year += 1900;
                    138:     $mon = sprintf("%02d", ($mon + 1));
                    139:     $day = sprintf("%02d", $day);
                    140: 
1.1       nick      141:        open FILEN, "<$comicFile";
                    142:                while (<FILEN>) {
                    143:                        if ( ( $_ !~ m/^#/ ) && ( $_ =~ m/,.*,/) ){
1.14      nick      144:                 $_ =~ s/__YEAR__/$year/g;
                    145:                 $_ =~ s/__MON__/$mon/g;
                    146:                 $_ =~ s/__DAY__/$day/g;
                    147:                 
1.1       nick      148:                                my @res = split /,/, $_;
                    149:                                $comicConfig{$res[0]}{'url'}      = $res[1];
                    150:                                $comicConfig{$res[0]}{'search'}   = $res[2];
                    151:                                $comicConfig{$res[0]}{'mode'}     = $res[3];
                    152:                                $comicConfig{$res[0]}{'fullName'} = $res[4];
                    153:                                $comicConfig{$res[0]}{'ext'}      = $res[5];
1.14      nick      154:                 $comicConfig{$res[0]}{'sunday'}   = $res[6] || 1;
1.20    ! nick      155:                 $comicConfig{$res[0]}{'sunday_only'} = $res[7] || 0;
1.1       nick      156:                                $comicConfig{$res[0]}{'error'}    = 0;
                    157:                        }
                    158:                        elsif ( $_ =~ m/(.*)\s+=\s+(.*)/ ) {
                    159:                                $comicConfig{'configs'}{$1} = $2;
                    160:                        }
                    161:                }
                    162:        close (FILEN);
                    163: 
                    164:        return %comicConfig;
                    165: }
                    166: 
                    167: #######################################################################
                    168: #######################################################################
                    169: sub writeComic ($$) {
                    170:        my ( $comics, $comic, $date ) = @_;
1.11      nick      171:        my $sd = substr( join( '', $days[$date->{'dow'}] ), 0, 3 );
1.12      nick      172:        my $indexFile = $indexDir . "/index-" . $date->{'year2'} . 
                    173:                        $date->{'mon2'} . $date->{'day2'} . "-" . 
                    174:                        $sd . ".html";
1.1       nick      175:        my $content = <<EOF;
                    176: 
                    177: <!-- ********* Begin $comic ($comics->{$comic}{'fullName'}) ******* -->
                    178:   <tr>
                    179:     <td align="left">
                    180: <font color="blue"><b>$comics->{$comic}{'fullName'}</b></font> &nbsp; &nbsp; 
                    181: <font size="-2">
                    182:        <a href="$comics->{$comic}{'url'}">
                    183:                $comics->{$comic}{'url'}
                    184:        </a>
                    185: </font><br/>
1.17      nick      186: <img src="../images/$date->{'mon2'}$date->{'year2'}/$comic-$date->{'day2'}.$comics->{$comic}{'ext'}" alt="$comic-$date->{'day2'}" />
1.1       nick      187: <br/><br/>
                    188: </td></tr>
                    189: <!-- ********* Finish $comic ($comics->{$comic}{'fullName'}) ******* -->
                    190: 
                    191: EOF
                    192:        open INDEX, ">>$indexFile";
                    193: 
                    194:        print INDEX $content if ( ! $comics->{$comic}{'error'} );
                    195: 
                    196:        print INDEX <<EOF
                    197: <font color="blue"><b>$comics->{$comic}{'fullName'}</b></font> &nbsp; &nbsp;
                    198: <font size="-2"><
                    199:         <a href="$comics->{$comic}{'url'}">
                    200:                 $comics->{$comic}{'url'}
                    201:         </a>
                    202: </font><br/>
                    203: <font color="red"><b>$comic :  $comics->{$comic}{'error'}</b></font><br/>
                    204:   </td>
                    205: </tr>
                    206: EOF
                    207:                if ( $comics->{$comic}{'error'} );
                    208: 
                    209:        close (INDEX);
                    210: 
                    211:        return 0;
                    212: }
                    213: 
                    214: 
                    215: #######################################################################
                    216: #######################################################################
                    217: sub writeMainIndex ($$) {
                    218:        my ( $date ) = @_;
                    219: 
                    220: }
                    221: 
                    222: 
                    223: #######################################################################
                    224: #######################################################################
                    225: sub writeFooter {
                    226:        my ( $date ) = @_;
1.11      nick      227:        my $sd = substr( join( '', $days[$date->{'dow'}] ), 0, 3 );
1.12      nick      228:        my $indexFile = $indexDir . "/index-" . $date->{'year2'} . 
                    229:                        $date->{'mon2'} . $date->{'day2'} . "-" . 
                    230:                        $sd . ".html";
1.1       nick      231:        my $sysDate = `date`;
                    232: 
                    233:        open INDEX, ">>$indexFile";
                    234:        print INDEX <<EOF;
                    235: </table>
1.3       nick      236: <center>
                    237: <font size="2">
                    238: Generated on: <font color="green">$sysDate</font><br/>
1.7       nick      239: Version: <font color="green">$ver</font><br />
                    240: CVS: <a href="http://demandred.dyndns.org/cgi-bin/cvsweb/comics/">http://demandred.dyndns.org/cgi-bin/cvsweb/comics/</a>
1.1       nick      241:   <p>
                    242:     <a href="http://validator.w3.org/check?uri=referer"><img
                    243:       src="http://www.w3.org/Icons/valid-xhtml10-blue" alt="Valid XHTML 1.0 Transitional" height="31" width="88" border="0" /></a>
                    244:   </p>
                    245: </center>
                    246: 
                    247: </body>
                    248: </html>
                    249: EOF
                    250:        close( INDEX );
                    251: }
                    252: 
                    253: #######################################################################
                    254: #######################################################################
                    255: sub checkDir ($$) {
                    256:        my @dir = @_;
                    257: 
                    258:        foreach ( @dir ) {
                    259:                if ( ! -d $_ ) { mkpath( $_ ); }
                    260:        }
                    261: }
                    262: 
                    263: #######################################################################
                    264: #######################################################################
                    265: sub writeTitle ($$) {
                    266:        my ( $date ) = @_;
1.11      nick      267:        my $sd = substr( join( '', $days[$date->{'dow'}] ), 0, 3 );
1.12      nick      268:        my $indexFile = $indexDir . "/index-" . $date->{'year2'} . 
                    269:                        $date->{'mon2'} . $date->{'day2'} . "-" . 
                    270:                        $sd . ".html";
1.8       nick      271:        my $today     = $days[$date->{'dow'}] . " " . $date->{'mon'} . "/" . $date->{'day'} . "/" . $date->{'year'};
1.16      nick      272:     my $today_long = Date_to_Text_Long(Today());
1.1       nick      273: 
                    274:        open INDEX, ">$indexFile";
                    275:        print INDEX <<EOF;
                    276: <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
                    277: 
                    278: <html xmlns="http://www.w3.org/1999/xhtml">
                    279: <head>
                    280: <meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
1.13      nick      281: <link href="/daily/comics/comics.css" type="text/css" rel="stylesheet" media="screen">
1.18      nick      282: <link rel="shortcut icon" href="./favicon.ico">
1.1       nick      283:     <title>Daily Comics for $today</title>
                    284:   </head>
                    285: <body bgcolor="#FFFFFF">
                    286: <table align="center" cellpadding="5" cellspacing="0">
1.16      nick      287: <tr><td>
                    288: <table cellpadding="0" cellspacing="0" border="0">
                    289: <tr><td align="Left"><img src="images/daily_comics_heading01.png"></td></tr>
                    290: <tr><td align="left">$today_long</td></tr>
                    291: <tr><td>&nbsp;</td></tr>
                    292: </td</tr>
                    293: 
1.1       nick      294: EOF
                    295:        close (INDEX);
                    296: }
                    297: 
                    298: #######################################################################
                    299: #######################################################################
                    300: sub directDownload ($$) {
                    301:        my ( $comics, $comic, $date ) = @_;
                    302:        my $file = &parseComic ( $comics, $comic, $date );
                    303: 
                    304:         ##
                    305:         ## Save the file to the appropriate directory
                    306:         ##
                    307:         my $cDir  = $date->{'mon2'} . $date->{'year2'};
                    308:         my $cDate = $date->{'day2'};
                    309: 
1.18      nick      310:        my $cmd = "wget -q $file --referer=\"" . $comics->{$comic}{'url'} ."\" --user-agent=\"$USER_AGENT\" -O - | /usr/bin/convert - jpeg:images/$cDir/$comic-$cDate.jpg";
1.14      nick      311: 
1.1       nick      312:         return system($cmd);
                    313: }
                    314: 
                    315: #######################################################################
                    316: #######################################################################
                    317: sub indexDownload ($$) {
                    318:        my ( $comics, $comic, $date ) = @_;
                    319:        my ( @lines, $comicLine, $mainURL );
                    320:        my $comicIndex = "indexes/index.$comic";
                    321: 
1.19      nick      322:     my $wget_cmd = "wget -q --referer=\"$comics->{$comic}{'url'}\" " .
                    323:                    "--user-agent=\"$USER_AGENT\" " .
                    324:                    "$comics->{$comic}{'url'} -O $comicIndex";
                    325:     system($wget_cmd);
1.1       nick      326: 
                    327:        if ( ! open FILEN, "<$comicIndex" ) {  
                    328:                return "ERROR: Can't open index file for " . $comics->{$comic}{'fullName'} . 
                    329:                       " (" . $comics->{$comic}{'url'} . ")"; 
                    330:        } 
                    331:                @lines = <FILEN>;
                    332:        close (FILEN);  
                    333: 
                    334:        unlink ("$comicIndex");
                    335: 
                    336:        $mainURL = $comics->{$comic}{'url'};
                    337:        ## I need to figure out how to merge these two in to one regex.
                    338:        $mainURL =~ s/(http:\/\/.*)(?:\/.*\/){1,}.*/$1/;
                    339:        $mainURL =~ s/([a-z])\/.*/$1/i;
                    340: 
                    341:        ##
                    342:        ## Find the comic strip URL based on the specified regex in the search
                    343:        ##
                    344:        foreach my $line (@lines) {
1.17      nick      345:                if ( $line =~ m/$comics->{$comic}{'search'}/i ) {
1.1       nick      346:                        $comicLine = $1; chomp $comicLine;
                    347:                }
1.17      nick      348:     }
1.1       nick      349: 
                    350:        ##
                    351:        ## Save the file to the appropriate directory
                    352:        ##
                    353:        my $cDir    = $date->{'mon2'} . $date->{'year2'};
                    354:        my $cDate   = $date->{'day2'};
                    355: 
                    356:        if ( $comicLine ) {
                    357:                if ( $comicLine =~ m/(gif|jpg|png)/i ) { $comics->{$comic}{'ext'} = $1; }
                    358:                my $comicURL = ( $comicLine =~ m/http/ ) ? $comicLine : $mainURL . $comicLine;
1.17      nick      359:                my $cmd = "wget --user-agent=\"$USER_AGENT\" --referer=\"" . $comics->{$comic}{'url'} . "\" -q $comicURL -O images/$cDir/$comic-$cDate.$comics->{$comic}{'ext'}";
1.1       nick      360:                system( $cmd );
                    361:                return 0;
                    362:        }
                    363: 
                    364:        unlink "index.html";
                    365: 
                    366:        return "ERROR: Could not download comic $comics->{$comic}{'fullName'}";
                    367: }
                    368: 
                    369: #######################################################################
                    370: #######################################################################
                    371: sub parseComic ($$) {
                    372:        my ( $comics, $comic, $date ) = @_;
                    373:        my $string = $comics->{$comic}{'search'};
                    374: 
                    375:        $string =~ s/__year__/$date->{'year'}/g;
                    376:        $string =~ s/__year2__/$date->{'year2'}/g;
                    377:        $string =~ s/__mon__/$date->{'mon'}/g;
                    378:        $string =~ s/__mon2__/$date->{'mon2'}/g;
                    379:        $string =~ s/__day__/$date->{'day'}/g;
                    380:        $string =~ s/__day2__/$date->{'day2'}/g;
                    381:        $string =~ s/__ext__/$comics->{$comic}{'ext'}/g;
                    382:        chomp $string;
                    383: 
                    384:        return $string;
                    385: }
                    386: 
                    387: #######################################################################
                    388: #######################################################################
                    389: sub fetchDates () {
                    390:        my %dates = ();
                    391: 
1.8       nick      392:        ($dates{'day'}, $dates{'mon'}, $dates{'year'}, $dates{'dow'}) = (localtime(time - (86400 * $days_ago )))[3,4,5,6];
1.1       nick      393: 
                    394:        $dates{'year'} += 1900;
                    395:        $dates{'year2'} = substr $dates{'year'}, 2, 2;
                    396:        $dates{'day2'}  = ( $dates{'day'} < 10 ) ? "0" . $dates{'day'} : $dates{'day'}; 
                    397:        $dates{'mon'}++;
                    398:        $dates{'mon2'}  = ( $dates{'mon'} < 10 ) ? "0".$dates{'mon'} : $dates{'mon'};
                    399: 
                    400:        return %dates;
                    401: }
1.8       nick      402: 
                    403: ###############################################################################
                    404: ##
                    405: ## &fetchOptions( );
                    406: ##
                    407: ##      Grab our command line arguments and toss them in to a hash
                    408: ##
                    409: ###############################################################################
                    410: sub fetchOptions {
                    411:         my %opts;
                    412: 
                    413:         &GetOptions(
                    414:                         "days:i"        => \$opts{'days'},
                    415:                         "help|?"        => \$opts{'help'},
                    416:                         "man"           => \$opts{'man'},
                    417:                    ) || &pod2usage( );
                    418:         &pod2usage( ) if defined $opts{'help'};
                    419:         &pod2usage( { -verbose => 2, -input => \*DATA } ) if defined $opts{'man'};
                    420: 
                    421:         return %opts;
                    422: }
                    423: 
                    424: __END__
                    425: 
                    426: =head1 NAME
                    427: 
                    428: fetch.pl - Fetches comics and places them all locally in a single html file.
                    429: 
                    430: =head1 SYNOPSIS
                    431: 
                    432: fetch.pl [options]
                    433: 
                    434: Options:
                    435:         --days,d        Fetch comics from X days ago
                    436:         --help,?        Display the basic help menu
                    437:         --man,m         Display the detailed man page
                    438: 
                    439: =head1 DESCRIPTION
                    440: 
                    441: =head1 HISTORY
                    442: 
                    443: =head1 AUTHOR
                    444: 
                    445: Nicholas DeClario <nick@declario.com>
                    446: 
                    447: =head1 BUGS
                    448: 
                    449: This is a work in progress.  Please report all bugs to the author.
                    450: 
                    451: =head1 SEE ALSO
                    452: 
                    453: =head1 COPYRIGHT
                    454: 
                    455: =cut
                    456: 
                    457: 

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>