Annotation of comics/fetch.pl.new, revision 1.18

1.1       nick        1: #!/usr/bin/perl -w
                      2: 
1.15      nick        3: ###############################################################################
1.16      nick        4: # $Log: fetch.pl.new,v $
1.18    ! nick        5: # Revision 1.17  2015/02/19 14:56:10  nick
        !             6: # Fixed a problem that forced everything to JPG.  This would kill GIF animations, but would not display the GIFs either because 'convert' appends an index number to the end of the file name for each frame of the GIF animation.  I fixed this to maintain GIF compatibility as well as rewriting how the script fetches the size of the file.  Additionally, I updated the configuration for Questionable Content to search for GIF or JPG, which is what triggered this entire update.
        !             7: #
1.17      nick        8: # Revision 1.16  2015/02/05 18:05:58  nick
                      9: # Changed the background and added a fancy title.
                     10: #
1.16      nick       11: # Revision 1.15  2015/01/19 13:46:19  nick
                     12: # *** empty log message ***
                     13: #
1.15      nick       14: ###############################################################################
                     15: 
1.1       nick       16: use strict;
                     17: use File::Path;
                     18: use Data::Dumper;
1.8       nick       19: use Pod::Usage;
                     20: use Getopt::Long;
1.1       nick       21: 
1.16      nick       22: use Date::Calc qw/Date_to_Text_Long Today/;
                     23: 
1.1       nick       24: ## 
                     25: ## Some default values
                     26: ##
# Script-wide settings.  The order of these declarations matters:
# the config file is read first, then the command line, and only then
# the dates (fetchDates reads $days_ago), which the directories depend on.
my $ver       = '$Id: fetch.pl.new,v 1.17 2015/02/19 14:56:10 nick Exp $';
my $comicFile = "comics.conf";

# Per-comic settings plus the 'configs' key = value section.
my %comics = &readComicConfig( $comicFile );

# Command line switches; --days selects how far back to fetch.
my %opts     = &fetchOptions();
my $days_ago = $opts{'days'} ? $opts{'days'} : 0;

# Date parts (day, mon, year, dow and the two-digit variants).
my %dates = &fetchDates();

# Output locations; images are grouped in a per-month sub directory.
my $baseDir  = $comics{'configs'}{'base_directory'} || ".";
my $imageDir = join( "/",
                     $baseDir,
                     ( $comics{'configs'}{'image_directory'} || "images" ),
                     "$dates{'mon2'}$dates{'year2'}" );
my $indexDir = join( "/",
                     $baseDir,
                     ( $comics{'configs'}{'index_directory'} || "indexes" ) );

# Browser identification sent with every wget request.
my $USER_AGENT = "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.18) Gecko/20110628 Ubuntu/10.10 (maverick) Firefox/3.6.18";

# Day names indexed by localtime's day-of-week value.
my @days = qw/ Sunday Monday Tuesday Wednesday Thursday Friday Saturday /;
1.1       nick       39: 
                     40: my $DATE=`date`; chomp $DATE;
                     41: print STDOUT "Starting comic fetch at $DATE\n";
                     42: 
                     43: ##
                     44: ## Main program starts here
                     45: ##
                     46: &checkDir ( [ $imageDir, $indexDir ] );
                     47: 
1.5       nick       48: &writeTitle ( \%dates );
1.1       nick       49: 
                     50: foreach my $comic ( sort keys %comics ) {
                     51:   next if ( $comic =~ m/config/ );
1.14      nick       52:   if ( ( $dates{'day2'} eq "Sunday" ) && 
                     53:        ( $comics{$comic}{'sunday'} == 0 ) ) { print "Skipping.\n"; next; }
1.1       nick       54:   $comics{$comic}{'error'} = &downloadComic ( \%comics, $comic, \%dates );
                     55:   &writeComic ( \%comics, $comic, \%dates );
                     56: 
1.17      nick       57:     my $file = "$imageDir/$comic-$dates{'day2'}.$comics{$comic}{'ext'}";
                     58:     my $size = 0;    
                     59: 
                     60:     my $cmd = "/usr/bin/identify -verbose $file|";
                     61:     open(IMG, $cmd) || die ("Can't open: $!\n");
                     62:     while(<IMG>) {
                     63:         if ($_ =~ m/^\s+geometry:\s+(\d+)x\d+.*/i) {
                     64:             $size = $1 if ( $size == 0);
                     65:         }
                     66:     }
                     67:     close(IMG);
1.4       nick       68: 
                     69:        system( "/usr/bin/convert -resize 640 $file $file" )
                     70:                if ( $size > 640 ) 
                     71: }
                     72: 
1.1       nick       73: ## &writeMainIndex ( \%dates );
                     74: 
                     75: &writeFooter( \%dates );
                     76: 
                     77: $DATE=`date`;  chomp( $DATE );
                     78: print STDOUT "Completed comic fetch at $DATE\n";
                     79: 
                     80: ## End
                     81: 
                     82: #######################################################################
                     83: ## Function :  downloadComic
                     84: ##
                     85: ##   Description :
                     86: ##     This function determines the download method being used to 
                     87: ##      retrieve the comic and calls the appropriate function.
                     88: ##
                     89: ##      If the mode is invalid an error will be returned.
                     90: ##
                     91: #######################################################################
                     92: sub downloadComic ($$) {
                     93:        my ( $comics, $comic, $date ) = @_;
                     94: 
                     95:        SWITCH: {
                     96:                if ( $comics->{$comic}{'mode'} eq 1 ) { 
                     97:                        return indexDownload ( \%comics, $comic, $date );
                     98:                        last SWITCH;
                     99:                }
                    100:                if ( $comics->{$comic}{'mode'} eq 2 ) { 
                    101:                        return directDownload ( \%comics, $comic, $date );
                    102:                        last SWITCH;
                    103:                }
                    104:        }
                    105:         
                    106:        return "ERROR: Unknown download method specified for $comics->{$comic}{'fullName'}.";
                    107: }
                    108: 
                    109: #######################################################################
                    110: #######################################################################
                    111: sub readComicConfig ($$) {
                    112:        my ( $comicFile ) = @_;
                    113:        my %comicConfig   = ( );
                    114:        my %config        = ( );
                    115: 
1.14      nick      116:     my ($year, $mon, $day) =( localtime(time))[5,4,3];
                    117:     $year += 1900;
                    118:     $mon = sprintf("%02d", ($mon + 1));
                    119:     $day = sprintf("%02d", $day);
                    120: 
1.1       nick      121:        open FILEN, "<$comicFile";
                    122:                while (<FILEN>) {
                    123:                        if ( ( $_ !~ m/^#/ ) && ( $_ =~ m/,.*,/) ){
1.14      nick      124:                 $_ =~ s/__YEAR__/$year/g;
                    125:                 $_ =~ s/__MON__/$mon/g;
                    126:                 $_ =~ s/__DAY__/$day/g;
                    127:                 
1.1       nick      128:                                my @res = split /,/, $_;
                    129:                                $comicConfig{$res[0]}{'url'}      = $res[1];
                    130:                                $comicConfig{$res[0]}{'search'}   = $res[2];
                    131:                                $comicConfig{$res[0]}{'mode'}     = $res[3];
                    132:                                $comicConfig{$res[0]}{'fullName'} = $res[4];
                    133:                                $comicConfig{$res[0]}{'ext'}      = $res[5];
1.14      nick      134:                 $comicConfig{$res[0]}{'sunday'}   = $res[6] || 1;
1.1       nick      135:                                $comicConfig{$res[0]}{'error'}    = 0;
                    136:                        }
                    137:                        elsif ( $_ =~ m/(.*)\s+=\s+(.*)/ ) {
                    138:                                $comicConfig{'configs'}{$1} = $2;
                    139:                        }
                    140:                }
                    141:        close (FILEN);
                    142: 
                    143:        return %comicConfig;
                    144: }
                    145: 
                    146: #######################################################################
                    147: #######################################################################
                    148: sub writeComic ($$) {
                    149:        my ( $comics, $comic, $date ) = @_;
1.11      nick      150:        my $sd = substr( join( '', $days[$date->{'dow'}] ), 0, 3 );
1.12      nick      151:        my $indexFile = $indexDir . "/index-" . $date->{'year2'} . 
                    152:                        $date->{'mon2'} . $date->{'day2'} . "-" . 
                    153:                        $sd . ".html";
1.1       nick      154:        my $content = <<EOF;
                    155: 
                    156: <!-- ********* Begin $comic ($comics->{$comic}{'fullName'}) ******* -->
                    157:   <tr>
                    158:     <td align="left">
                    159: <font color="blue"><b>$comics->{$comic}{'fullName'}</b></font> &nbsp; &nbsp; 
                    160: <font size="-2">
                    161:        <a href="$comics->{$comic}{'url'}">
                    162:                $comics->{$comic}{'url'}
                    163:        </a>
                    164: </font><br/>
1.17      nick      165: <img src="../images/$date->{'mon2'}$date->{'year2'}/$comic-$date->{'day2'}.$comics->{$comic}{'ext'}" alt="$comic-$date->{'day2'}" />
1.1       nick      166: <br/><br/>
                    167: </td></tr>
                    168: <!-- ********* Finish $comic ($comics->{$comic}{'fullName'}) ******* -->
                    169: 
                    170: EOF
                    171:        open INDEX, ">>$indexFile";
                    172: 
                    173:        print INDEX $content if ( ! $comics->{$comic}{'error'} );
                    174: 
                    175:        print INDEX <<EOF
                    176: <font color="blue"><b>$comics->{$comic}{'fullName'}</b></font> &nbsp; &nbsp;
                    177: <font size="-2"><
                    178:         <a href="$comics->{$comic}{'url'}">
                    179:                 $comics->{$comic}{'url'}
                    180:         </a>
                    181: </font><br/>
                    182: <font color="red"><b>$comic :  $comics->{$comic}{'error'}</b></font><br/>
                    183:   </td>
                    184: </tr>
                    185: EOF
                    186:                if ( $comics->{$comic}{'error'} );
                    187: 
                    188:        close (INDEX);
                    189: 
                    190:        return 0;
                    191: }
                    192: 
                    193: 
                    194: #######################################################################
                    195: #######################################################################
                    196: sub writeMainIndex ($$) {
                    197:        my ( $date ) = @_;
                    198: 
                    199: }
                    200: 
                    201: 
                    202: #######################################################################
                    203: #######################################################################
                    204: sub writeFooter {
                    205:        my ( $date ) = @_;
1.11      nick      206:        my $sd = substr( join( '', $days[$date->{'dow'}] ), 0, 3 );
1.12      nick      207:        my $indexFile = $indexDir . "/index-" . $date->{'year2'} . 
                    208:                        $date->{'mon2'} . $date->{'day2'} . "-" . 
                    209:                        $sd . ".html";
1.1       nick      210:        my $sysDate = `date`;
                    211: 
                    212:        open INDEX, ">>$indexFile";
                    213:        print INDEX <<EOF;
                    214: </table>
1.3       nick      215: <center>
                    216: <font size="2">
                    217: Generated on: <font color="green">$sysDate</font><br/>
1.7       nick      218: Version: <font color="green">$ver</font><br />
                    219: CVS: <a href="http://demandred.dyndns.org/cgi-bin/cvsweb/comics/">http://demandred.dyndns.org/cgi-bin/cvsweb/comics/</a>
1.1       nick      220:   <p>
                    221:     <a href="http://validator.w3.org/check?uri=referer"><img
                    222:       src="http://www.w3.org/Icons/valid-xhtml10-blue" alt="Valid XHTML 1.0 Transitional" height="31" width="88" border="0" /></a>
                    223:   </p>
                    224: </center>
                    225: 
                    226: </body>
                    227: </html>
                    228: EOF
                    229:        close( INDEX );
                    230: }
                    231: 
                    232: #######################################################################
                    233: #######################################################################
                    234: sub checkDir ($$) {
                    235:        my @dir = @_;
                    236: 
                    237:        foreach ( @dir ) {
                    238:                if ( ! -d $_ ) { mkpath( $_ ); }
                    239:        }
                    240: }
                    241: 
                    242: #######################################################################
                    243: #######################################################################
                    244: sub writeTitle ($$) {
                    245:        my ( $date ) = @_;
1.11      nick      246:        my $sd = substr( join( '', $days[$date->{'dow'}] ), 0, 3 );
1.12      nick      247:        my $indexFile = $indexDir . "/index-" . $date->{'year2'} . 
                    248:                        $date->{'mon2'} . $date->{'day2'} . "-" . 
                    249:                        $sd . ".html";
1.8       nick      250:        my $today     = $days[$date->{'dow'}] . " " . $date->{'mon'} . "/" . $date->{'day'} . "/" . $date->{'year'};
1.16      nick      251:     my $today_long = Date_to_Text_Long(Today());
1.1       nick      252: 
                    253:        open INDEX, ">$indexFile";
                    254:        print INDEX <<EOF;
                    255: <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
                    256: 
                    257: <html xmlns="http://www.w3.org/1999/xhtml">
                    258: <head>
                    259: <meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
1.13      nick      260: <link href="/daily/comics/comics.css" type="text/css" rel="stylesheet" media="screen">
1.18    ! nick      261: <link rel="shortcut icon" href="./favicon.ico">
1.1       nick      262:     <title>Daily Comics for $today</title>
                    263:   </head>
                    264: <body bgcolor="#FFFFFF">
                    265: <table align="center" cellpadding="5" cellspacing="0">
1.16      nick      266: <tr><td>
                    267: <table cellpadding="0" cellspacing="0" border="0">
                    268: <tr><td align="Left"><img src="images/daily_comics_heading01.png"></td></tr>
                    269: <tr><td align="left">$today_long</td></tr>
                    270: <tr><td>&nbsp;</td></tr>
                    271: </td</tr>
                    272: 
1.1       nick      273: EOF
                    274:        close (INDEX);
                    275: }
                    276: 
                    277: #######################################################################
                    278: #######################################################################
                    279: sub directDownload ($$) {
                    280:        my ( $comics, $comic, $date ) = @_;
                    281:        my $file = &parseComic ( $comics, $comic, $date );
                    282: 
                    283:         ##
                    284:         ## Save the file to the appropriate directory
                    285:         ##
                    286:         my $cDir  = $date->{'mon2'} . $date->{'year2'};
                    287:         my $cDate = $date->{'day2'};
                    288: 
1.18    ! nick      289:        my $cmd = "wget -q $file --referer=\"" . $comics->{$comic}{'url'} ."\" --user-agent=\"$USER_AGENT\" -O - | /usr/bin/convert - jpeg:images/$cDir/$comic-$cDate.jpg";
1.14      nick      290: 
1.1       nick      291:         return system($cmd);
                    292: }
                    293: 
                    294: #######################################################################
                    295: #######################################################################
                    296: sub indexDownload ($$) {
                    297:        my ( $comics, $comic, $date ) = @_;
                    298:        my ( @lines, $comicLine, $mainURL );
                    299:        my $comicIndex = "indexes/index.$comic";
                    300: 
                    301:        `wget -q $comics->{$comic}{'url'} -O $comicIndex`;
                    302: 
                    303:        if ( ! open FILEN, "<$comicIndex" ) {  
                    304:                return "ERROR: Can't open index file for " . $comics->{$comic}{'fullName'} . 
                    305:                       " (" . $comics->{$comic}{'url'} . ")"; 
                    306:        } 
                    307:                @lines = <FILEN>;
                    308:        close (FILEN);  
                    309: 
                    310:        unlink ("$comicIndex");
                    311: 
                    312:        $mainURL = $comics->{$comic}{'url'};
                    313:        ## I need to figure out how to merge these two in to one regex.
                    314:        $mainURL =~ s/(http:\/\/.*)(?:\/.*\/){1,}.*/$1/;
                    315:        $mainURL =~ s/([a-z])\/.*/$1/i;
                    316: 
                    317:        ##
                    318:        ## Find the comic strip URL based on the specified regex in the search
                    319:        ##
                    320:        foreach my $line (@lines) {
1.17      nick      321:                if ( $line =~ m/$comics->{$comic}{'search'}/i ) {
1.1       nick      322:                        $comicLine = $1; chomp $comicLine;
                    323:                }
1.17      nick      324:     }
1.1       nick      325: 
                    326:        ##
                    327:        ## Save the file to the appropriate directory
                    328:        ##
                    329:        my $cDir    = $date->{'mon2'} . $date->{'year2'};
                    330:        my $cDate   = $date->{'day2'};
                    331: 
                    332:        if ( $comicLine ) {
                    333:                if ( $comicLine =~ m/(gif|jpg|png)/i ) { $comics->{$comic}{'ext'} = $1; }
                    334:                my $comicURL = ( $comicLine =~ m/http/ ) ? $comicLine : $mainURL . $comicLine;
1.17      nick      335:                my $cmd = "wget --user-agent=\"$USER_AGENT\" --referer=\"" . $comics->{$comic}{'url'} . "\" -q $comicURL -O images/$cDir/$comic-$cDate.$comics->{$comic}{'ext'}";
1.1       nick      336:                system( $cmd );
                    337:                return 0;
                    338:        }
                    339: 
                    340:        unlink "index.html";
                    341: 
                    342:        return "ERROR: Could not download comic $comics->{$comic}{'fullName'}";
                    343: }
                    344: 
                    345: #######################################################################
                    346: #######################################################################
                    347: sub parseComic ($$) {
                    348:        my ( $comics, $comic, $date ) = @_;
                    349:        my $string = $comics->{$comic}{'search'};
                    350: 
                    351:        $string =~ s/__year__/$date->{'year'}/g;
                    352:        $string =~ s/__year2__/$date->{'year2'}/g;
                    353:        $string =~ s/__mon__/$date->{'mon'}/g;
                    354:        $string =~ s/__mon2__/$date->{'mon2'}/g;
                    355:        $string =~ s/__day__/$date->{'day'}/g;
                    356:        $string =~ s/__day2__/$date->{'day2'}/g;
                    357:        $string =~ s/__ext__/$comics->{$comic}{'ext'}/g;
                    358:        chomp $string;
                    359: 
                    360:        return $string;
                    361: }
                    362: 
                    363: #######################################################################
                    364: #######################################################################
                    365: sub fetchDates () {
                    366:        my %dates = ();
                    367: 
1.8       nick      368:        ($dates{'day'}, $dates{'mon'}, $dates{'year'}, $dates{'dow'}) = (localtime(time - (86400 * $days_ago )))[3,4,5,6];
1.1       nick      369: 
                    370:        $dates{'year'} += 1900;
                    371:        $dates{'year2'} = substr $dates{'year'}, 2, 2;
                    372:        $dates{'day2'}  = ( $dates{'day'} < 10 ) ? "0" . $dates{'day'} : $dates{'day'}; 
                    373:        $dates{'mon'}++;
                    374:        $dates{'mon2'}  = ( $dates{'mon'} < 10 ) ? "0".$dates{'mon'} : $dates{'mon'};
                    375: 
                    376:        return %dates;
                    377: }
1.8       nick      378: 
                    379: ###############################################################################
                    380: ##
                    381: ## &fetchOptions( );
                    382: ##
                    383: ##      Grab our command line arguments and toss them in to a hash
                    384: ##
                    385: ###############################################################################
                    386: sub fetchOptions {
                    387:         my %opts;
                    388: 
                    389:         &GetOptions(
                    390:                         "days:i"        => \$opts{'days'},
                    391:                         "help|?"        => \$opts{'help'},
                    392:                         "man"           => \$opts{'man'},
                    393:                    ) || &pod2usage( );
                    394:         &pod2usage( ) if defined $opts{'help'};
                    395:         &pod2usage( { -verbose => 2, -input => \*DATA } ) if defined $opts{'man'};
                    396: 
                    397:         return %opts;
                    398: }
                    399: 
                    400: __END__
                    401: 
                    402: =head1 NAME
                    403: 
                    404: fetch.pl - Fetches comics and places them all locally in a single html file.
                    405: 
                    406: =head1 SYNOPSIS
                    407: 
                    408: fetch.pl [options]
                    409: 
                    410: Options:
                    411:         --days,d        Fetch comics from X days ago
                    412:         --help,?        Display the basic help menu
                    413:         --man,m         Display the detailed man page
                    414: 
                    415: =head1 DESCRIPTION
                    416: 
                    417: =head1 HISTORY
                    418: 
                    419: =head1 AUTHOR
                    420: 
                    421: Nicholas DeClario <nick@declario.com>
                    422: 
                    423: =head1 BUGS
                    424: 
                    425: This is a work in progress.  Please report all bugs to the author.
                    426: 
                    427: =head1 SEE ALSO
                    428: 
                    429: =head1 COPYRIGHT
                    430: 
                    431: =cut
                    432: 
                    433: 

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>