Annotation of comics/fetch.pl.new, revision 1.27
1.1 nick 1: #!/usr/bin/perl -w
2:
1.15 nick 3: ###############################################################################
1.16 nick 4: # $Log: fetch.pl.new,v $
1.27 ! nick 5: # Revision 1.26 2018/04/22 14:03:54 nick
! 6: # Changed the default for Sunday comics that was causing issues with some comics.
! 7: #
1.26 nick 8: # Revision 1.25 2018/02/12 13:30:58 nick
9: # Added an easier to compare date string to determine if the status json file was updated today and report if it wasn't.
10: #
1.25 nick 11: # Revision 1.24 2018/02/06 14:31:06 nick
12: # A status report is now generated in JSON that can easily be scanned so that
13: # I can be alerted when there are failures that I miss if I don't read the
14: # comics that day.
15: #
1.24 nick 16: # Revision 1.23 2018/01/26 13:05:27 nick
17: # Added a new config option to remove all newline from the resulting index.html
18: # file. This allows for easier parsing for certain comics. I then updated
19: # the URLs to search for and enabled the newline removal for a handful
20: # of uComics.
21: #
22: # I believe I've also properly fixed the Comic Config version displayed on
23: # the webpage itself.
24: #
1.23 nick 25: # Revision 1.22 2017/12/05 13:37:40 nick
26: # Added the CVS config version to the output.
27: #
1.22 nick 28: # Revision 1.21 2015/10/26 14:25:40 nick
29: # Fixed a bug that was improperly including the day of week string, preventing the weekend comics from fetching properly.
30: #
1.21 nick 31: # Revision 1.20 2015/10/22 12:58:44 nick
32: # Added the ability for Sunday only comics. Stonesoup is no longer weekdays, this has been added to Sunday only. I also added Foxtrot Classics for weekdays and Foxtrot for Sundays.
33: #
1.20 nick 34: # Revision 1.19 2015/07/13 12:56:58 nick
35: # Added Sally Forth and Pearls Before Swine. Adding Sally Forth required a change in the 'wget' command for fetching the index file to include 'user-agent' and 'referer'.
36: #
1.19 nick 37: # Revision 1.18 2015/05/07 12:31:43 nick
38: # Added favicon
39: #
1.18 nick 40: # Revision 1.17 2015/02/19 14:56:10 nick
41: # Fixed a problem that forced everything to JPG. This would kill GIF animations, but would not display the gifs either because 'convert' appends an index number to the end of the file name for each frame of the GIF animation. I fixed this to maintain GIF compatibility as well as rewriting how the script fetches the size of the file. Additionally, I updated the configuration for Questionable Content to search for GIF or JPG, which is what triggered this entire update.
42: #
1.17 nick 43: # Revision 1.16 2015/02/05 18:05:58 nick
44: # Changed the background and added a fancy title.
45: #
1.16 nick 46: # Revision 1.15 2015/01/19 13:46:19 nick
47: # *** empty log message ***
48: #
1.15 nick 49: ###############################################################################
50:
1.1 nick 51: use strict;
52: use File::Path;
53: use Data::Dumper;
1.8 nick 54: use Pod::Usage;
55: use Getopt::Long;
1.24 nick 56: use JSON::Create 'create_json';
1.21 nick 57: use Date::Calc qw/Date_to_Text_Long Today Day_of_Week Day_of_Week_to_Text/;
1.16 nick 58:
##
## Script-wide defaults and run-time state.
##
## The order of these declarations is significant:
##   * $comicConfigVer must exist before readComicConfig() runs, because
##     that function overwrites it when it finds a version line.
##   * $days_ago must be set before fetchDates() runs.
##   * @days is only populated after fetchDates() has been called, which
##     is why fetchDates() carries its own copy of the day names.
##
my $ver            = '$Id: fetch.pl.new,v 1.26 2018/04/22 14:03:54 nick Exp $';
my $comicFile      = "comics.conf";
my $comicConfigVer = "Unknown";
my $reportFile     = "/home/httpd/html/daily/comics/status_report.json";

## Configuration file first, then the command line.
my %comics   = &readComicConfig ( $comicFile );
my %opts     = &fetchOptions( );
my $days_ago = $opts{'days'} || 0;
my %dates    = &fetchDates();

## Output locations; each part falls back to a default when the
## configuration does not provide it.
my $baseDir  = $comics{'configs'}{'base_directory'} || ".";
my $imageDir = join( "/",
                     $baseDir,
                     ( $comics{'configs'}{'image_directory'} || "images" ),
                     "$dates{'mon2'}$dates{'year2'}" );
my $indexDir = join( "/",
                     $baseDir,
                     ( $comics{'configs'}{'index_directory'} || "indexes" ) );

my $USER_AGENT = "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.18) Gecko/20110628 Ubuntu/10.10 (maverick) Firefox/3.6.18";
my @days       = qw/ Sunday Monday Tuesday Wednesday Thursday Friday Saturday /;

chomp( my $DATE = `date` );
print STDOUT "Starting comic fetch at $DATE\n";
79:
##
## Main program starts here
##
## Creates the output directories, writes the page header, then for
## every configured comic: downloads it, appends its HTML fragment to
## the day's index page and shrinks the image to 640 pixels wide when it
## is wider than that.  Finally the footer and the JSON status report
## are written.
##
## NOTE: the subs are called with a leading '&' on purpose.  They are
## defined further down and carry prototypes that do not match their
## real argument lists; '&' bypasses the prototype.
##
&checkDir ( [ $imageDir, $indexDir ] );

&writeTitle ( \%dates );

foreach my $comic ( sort keys %comics ) {

    ## The 'configs' entry holds global settings, it is not a comic.
    next if ( $comic =~ m/config/ );

    ## Skip if this is Sunday and the comic is weekdays only
    if ( ( $dates{'wday'} eq "Sunday" ) &&
         ( $comics{$comic}{'not_sunday'} == 1 ) ) {
        print "Skipping '$comic'; Weekdays only.\n";
        next;
    }

    ## Skip if Sunday only comic and it's not Sunday.
    if ( ( $dates{'wday'} ne "Sunday" ) &&
         ( $comics{$comic}{'sunday_only'} == 1 ) ) {
        print "Skipping '$comic' ($comics{$comic}{'sunday_only'}); Sunday only.\n";
        next;
    }

    $comics{$comic}{'error'} = &downloadComic ( \%comics, $comic, \%dates );
    &writeComic ( \%comics, $comic, \%dates );

    my $file = "$imageDir/$comic-$dates{'day2'}.$comics{$comic}{'ext'}";
    my $size = 0;

    ## Nothing to measure or resize when the download left no file.
    next unless ( -e $file );

    ## List-form pipe open: the file name is handed to 'identify' as a
    ## single argument and never passes through a shell.
    open( my $img, '-|', '/usr/bin/identify', '-verbose', $file )
        || die ("Can't open: $!\n");
    while ( my $line = <$img> ) {
        ## Only the first geometry line counts (an animated GIF
        ## reports one per frame).
        if ( ( $size == 0 ) &&
             ( $line =~ m/^\s+geometry:\s+(\d+)x\d+.*/i ) ) {
            $size = $1;
        }
    }
    close( $img );

    ## Resize in place; list form again avoids the shell.
    system( '/usr/bin/convert', '-resize', '640', $file, $file )
        if ( $size > 640 );
}

## &writeMainIndex ( \%dates );

&writeFooter( \%dates );

print STDOUT "Status written to $reportFile.\n"
    if (&writeStatusReportJSON(\%comics, $reportFile));

$DATE=`date`; chomp( $DATE );
print STDOUT "Completed comic fetch at $DATE\n";

## End
135:
#######################################################################
## Function    : downloadComic
##
## Description :
##   Determines the download method configured for a comic and calls
##   the appropriate function.
##
## Parameters  :
##   $comics - reference to the comic configuration hash
##   $comic  - key of the comic to download
##   $date   - reference to the dates hash built by fetchDates()
##
## Returns     :
##   Whatever the selected download function returns (false on
##   success), or an error string when the mode is missing or unknown.
##
#######################################################################
sub downloadComic {
    my ( $comics, $comic, $date ) = @_;

    ## A missing mode is treated like an unknown one instead of
    ## triggering an "uninitialized value" warning.
    my $mode = $comics->{$comic}{'mode'};
    $mode = '' unless defined $mode;

    ## Pass on the reference we were given rather than the file-level
    ## %comics hash.  The '&' call form bypasses the prototypes of the
    ## download functions, which do not match their three arguments.
    return &indexDownload ( $comics, $comic, $date )  if ( $mode eq 1 );
    return &directDownload ( $comics, $comic, $date ) if ( $mode eq 2 );

    return "ERROR: Unknown download method specified for $comics->{$comic}{'fullName'}.";
}
162:
#######################################################################
## Function    : readComicConfig
##
## Description :
##   Reads the comic configuration file.  Three kinds of lines are
##   recognised:
##     * a comment line carrying a CVS Id keyword; its revision is
##       stored in the file-level $comicConfigVer,
##     * a non-comment line with at least two commas; it is split on
##       ',' into
##         name,url,search,mode,fullName,ext,not_sunday,sunday_only,
##         remove_newlines
##       after __YEAR__, __MON__ and __DAY__ have been replaced with
##       today's date,
##     * a "key = value" line; it is stored under the 'configs' key.
##
## Parameters  :
##   $comicFile - path of the configuration file
##
## Returns     :
##   The configuration as a hash (list).  When the file cannot be read
##   a warning is printed and the hash is empty.
##
#######################################################################
sub readComicConfig {
    my ( $comicFile ) = @_;
    my %comicConfig = ( );

    my ( $year, $mon, $day ) = ( localtime(time) )[5,4,3];
    $year += 1900;
    $mon = sprintf("%02d", ($mon + 1));
    $day = sprintf("%02d", $day);

    my $fh;
    if ( ! open( $fh, '<', $comicFile ) ) {
        warn "WARNING: Can't read comic config '$comicFile': $!\n";
        return %comicConfig;
    }

    while ( my $line = <$fh> ) {
        if ( $line =~ m/^#.* \$[Ii][dD]: .*,v\ (.*)\ \d{4}\/.*\$$/ ) {
            $comicConfigVer = $1;
        }
        if ( ( $line !~ m/^#/ ) && ( $line =~ m/,.*,/ ) ) {
            ## Drop the line ending so it does not end up inside the
            ## last field of the record.
            chomp $line;

            $line =~ s/__YEAR__/$year/g;
            $line =~ s/__MON__/$mon/g;
            $line =~ s/__DAY__/$day/g;

            my @res = split /,/, $line;
            my $name = $res[0];
            $comicConfig{$name}{'url'}             = $res[1];
            $comicConfig{$name}{'search'}          = $res[2];
            $comicConfig{$name}{'mode'}            = $res[3];
            $comicConfig{$name}{'fullName'}        = $res[4];
            $comicConfig{$name}{'ext'}             = $res[5];
            $comicConfig{$name}{'not_sunday'}      = sprintf("%d", $res[6] || 0);
            $comicConfig{$name}{'sunday_only'}     = sprintf("%d", $res[7] || 0);
            $comicConfig{$name}{'remove_newlines'} = sprintf("%d", $res[8] || 0);
            $comicConfig{$name}{'error'}           = 0;
        }
        elsif ( $line =~ m/(.*)\s+=\s+(.*)/ ) {
            $comicConfig{'configs'}{$1} = $2;
        }
    }
    close( $fh );

    return %comicConfig;
}
205:
#######################################################################
## Function    : writeStatusReportJSON
##
## Description :
##   Writes a JSON status report with today's date (YYYYMMDD), one
##   entry per comic (every configuration entry that has a 'fullName')
##   and the total number of errors.
##
## Parameters  :
##   $comicsRef - reference to the comic configuration hash
##   $filename  - path of the report file to (over)write
##
## Returns     :
##   The result of closing the report file (true on success).  Dies
##   when the file cannot be created.
##
#######################################################################
sub writeStatusReportJSON ($$) {
    my ( $comicsRef, $filename ) = @_;

    ## One localtime() call so all three parts come from the same instant.
    my ( $mday, $mon, $year ) = ( localtime )[3,4,5];
    my $shortDate = sprintf("%d%02d%02d", $year + 1900, $mon + 1, $mday);

    ## 'comics' must be an array reference: an empty list "()" would
    ## vanish from the hash initialiser and leave an odd-length list.
    my %json = ( 'date' => $shortDate, 'comics' => [] );
    my $totalErrors = 0;

    foreach my $comic ( sort keys %{$comicsRef} ) {
        my $entry = $comicsRef->{$comic};
        next unless $entry->{'fullName'};

        if ( $entry->{'error'} ) {
            push @{ $json{'comics'} }, {
                'comicName' => "$entry->{'fullName'}",
                'error'     => "$entry->{'error'}",
                'status'    => "Error",
            };
            $totalErrors += 1;
        }
        else {
            push @{ $json{'comics'} }, {
                'comicName' => "$entry->{'fullName'}",
                'error'     => 0,
                'status'    => "Successfull",
            };
        }
    }
    $json{'totalErrors'} = $totalErrors;

    open( my $sr, '>', $filename )
        or die ("ERROR: Failed to create status report: $!\n");
    print {$sr} create_json (\%json);

    return close( $sr );
}
238:
#######################################################################
## Function    : writeComic
##
## Description :
##   Appends the HTML table row for one comic to the day's index page.
##   When the comic's 'error' entry is false the row shows the image;
##   otherwise it shows the error text in red.
##
## Parameters  :
##   $comics - reference to the comic configuration hash
##   $comic  - key of the comic to write
##   $date   - reference to the dates hash built by fetchDates()
##
## Returns     :
##   Always 0.  A warning is printed when the index page cannot be
##   opened.
##
#######################################################################
sub writeComic {
    my ( $comics, $comic, $date ) = @_;
    my $sd        = substr( $days[ $date->{'dow'} ], 0, 3 );
    my $indexFile = $indexDir . "/index-" . $date->{'year2'} .
                    $date->{'mon2'} . $date->{'day2'} . "-" .
                    $sd . ".html";
    my $content;

    if ( ! $comics->{$comic}{'error'} ) {
        $content = <<EOF;

<!-- ********* Begin $comic ($comics->{$comic}{'fullName'}) ******* -->
<tr>
<td align="left">
<font color="blue"><b>$comics->{$comic}{'fullName'}</b></font>
<font size="-2">
<a href="$comics->{$comic}{'url'}">
$comics->{$comic}{'url'}
</a>
</font><br/>
<img src="../images/$date->{'mon2'}$date->{'year2'}/$comic-$date->{'day2'}.$comics->{$comic}{'ext'}" alt="$comic-$date->{'day2'}" />
<br/><br/>
</td></tr>
<!-- ********* Finish $comic ($comics->{$comic}{'fullName'}) ******* -->

EOF
    }
    else {
        ## The error row opens its own <tr><td> so that the closing
        ## tags below are balanced.
        $content = <<EOF;
<tr>
<td align="left">
<font color="blue"><b>$comics->{$comic}{'fullName'}</b></font>
<font size="-2">
<a href="$comics->{$comic}{'url'}">
$comics->{$comic}{'url'}
</a>
</font><br/>
<font color="red"><b>$comic : $comics->{$comic}{'error'}</b></font><br/>
</td>
</tr>
EOF
    }

    my $index;
    if ( ! open( $index, '>>', $indexFile ) ) {
        warn "WARNING: Can't append to '$indexFile': $!\n";
        return 0;
    }
    print {$index} $content;
    close( $index );

    return 0;
}
285:
286:
#######################################################################
## Function    : writeMainIndex
##
## Description :
##   Placeholder.  It unpacks its argument and does nothing else; the
##   only call to it in the main program is commented out.
##
## Parameters  :
##   $date - reference to the dates hash (unused)
##
#######################################################################
sub writeMainIndex ($$) {
    my ( $date ) = @_;
}
293:
294:
#######################################################################
## Function    : writeFooter
##
## Description :
##   Appends the closing part of the day's index page: the end of the
##   comics table, the generation time, the script and configuration
##   versions and the closing body/html tags.
##
## Parameters  :
##   $date - reference to the dates hash built by fetchDates()
##
## Returns     :
##   The result of closing the index page, or 0 (after a warning) when
##   the page cannot be opened.
##
#######################################################################
sub writeFooter {
    my ( $date ) = @_;
    my $sd        = substr( $days[ $date->{'dow'} ], 0, 3 );
    my $indexFile = $indexDir . "/index-" . $date->{'year2'} .
                    $date->{'mon2'} . $date->{'day2'} . "-" .
                    $sd . ".html";
    my $sysDate   = `date`;

    my $index;
    if ( ! open( $index, '>>', $indexFile ) ) {
        warn "WARNING: Can't append to '$indexFile': $!\n";
        return 0;
    }
    print {$index} <<EOF;
</table>
<center>
<font size="2">
Generated on: <font color="green">$sysDate</font><br/>
Version: <font color="green">$ver</font><br />
Config Version: <font color="green">$comicConfigVer</font><br />
CVS: <a href="http://demandred.dyndns.org/cgi-bin/cvsweb/comics/">http://demandred.dyndns.org/cgi-bin/cvsweb/comics/</a>
<p>
<a href="http://validator.w3.org/check?uri=referer"><img
src="http://www.w3.org/Icons/valid-xhtml10-blue" alt="Valid XHTML 1.0 Transitional" height="31" width="88" border="0" /></a>
</p>
</center>

</body>
</html>
EOF
    return close( $index );
}
325:
#######################################################################
## Function    : checkDir
##
## Description :
##   Makes sure every given directory exists, creating missing ones
##   (including parents) with mkpath().
##
## Parameters  :
##   A list of directory names and/or references to arrays of directory
##   names.  The main program passes a single array reference; it is
##   flattened here so each directory is tested on its own.
##
#######################################################################
sub checkDir {
    my @dirs = map { ( ref($_) eq 'ARRAY' ) ? @{$_} : $_ } @_;

    foreach my $dir ( @dirs ) {
        next if ( ! defined $dir || $dir eq '' );
        mkpath( $dir ) if ( ! -d $dir );
    }

    return;
}
335:
#######################################################################
## Function    : writeTitle
##
## Description :
##   Creates (truncating any existing file) the day's index page and
##   writes the XHTML header, the page heading and the opening of the
##   comics table.
##
## Parameters  :
##   $date - reference to the dates hash built by fetchDates()
##
## Returns     :
##   The result of closing the index page, or 0 (after a warning) when
##   the page cannot be created.
##
#######################################################################
sub writeTitle {
    my ( $date ) = @_;
    my $sd        = substr( $days[ $date->{'dow'} ], 0, 3 );
    my $indexFile = $indexDir . "/index-" . $date->{'year2'} .
                    $date->{'mon2'} . $date->{'day2'} . "-" .
                    $sd . ".html";
    my $today      = $days[$date->{'dow'}] . " " . $date->{'mon'} . "/" . $date->{'day'} . "/" . $date->{'year'};
    my $today_long = Date_to_Text_Long(Today());

    my $index;
    if ( ! open( $index, '>', $indexFile ) ) {
        warn "WARNING: Can't create '$indexFile': $!\n";
        return 0;
    }

    ## The heading lives in its own inner table, which is closed before
    ## the outer cell and row are closed.  Empty elements are
    ## self-closed because the page declares itself XHTML.
    print {$index} <<EOF;
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<link href="/daily/comics/comics.css" type="text/css" rel="stylesheet" media="screen" />
<link rel="shortcut icon" href="./favicon.ico" />
<title>Daily Comics for $today</title>
</head>
<body bgcolor="#FFFFFF">
<table align="center" cellpadding="5" cellspacing="0">
<tr><td>
<table cellpadding="0" cellspacing="0" border="0">
<tr><td align="left"><img src="images/daily_comics_heading01.png" alt="Daily Comics" /></td></tr>
<tr><td align="left">$today_long</td></tr>
<tr><td> </td></tr>
</table>
</td></tr>

EOF
    return close( $index );
}
370:
#######################################################################
## Function    : directDownload
##
## Description :
##   Downloads a comic whose image URL can be built directly from the
##   'search' template (see parseComic) and stores it, converted to
##   JPEG, as <imageDir>/<comic>-<day2>.jpg.
##
## Parameters  :
##   $comics - reference to the comic configuration hash
##   $comic  - key of the comic to download
##   $date   - reference to the dates hash built by fetchDates()
##
## Returns     :
##   The status returned by system() for the wget | convert pipeline
##   (0 on success).
##
#######################################################################
sub directDownload {
    my ( $comics, $comic, $date ) = @_;
    my $file = &parseComic ( $comics, $comic, $date );

    ## Wrap a value in single quotes for the shell, escaping embedded
    ## single quotes, so that characters such as '&' or '?' in a URL
    ## are not interpreted by the shell.
    my $quote = sub {
        my ( $text ) = @_;
        $text = '' unless defined $text;
        $text =~ s/'/'\\''/g;
        return "'" . $text . "'";
    };

    ##
    ## Save the file to the directory the main program reads it from.
    ##
    my $cDate  = $date->{'day2'};
    my $target = "$imageDir/$comic-$cDate.jpg";

    my $cmd = "wget -q " . $quote->( $file ) .
              " --referer=" . $quote->( $comics->{$comic}{'url'} ) .
              " --user-agent=" . $quote->( $USER_AGENT ) .
              " -O - | /usr/bin/convert - " . $quote->( "jpeg:$target" );

    ## The image is always written as .jpg, so record that extension
    ## for the code that later builds the file name and the <img> tag.
    $comics->{$comic}{'ext'} = 'jpg';

    return system($cmd);
}
387:
#######################################################################
## Function    : indexDownload
##
## Description :
##   Downloads the comic's web page, searches it line by line with the
##   comic's 'search' regex (case-insensitive; the first capture group
##   of the LAST matching line is used) and downloads the image that
##   capture points to.  A capture without "http" is appended to the
##   site root derived from the page URL.  When the capture contains
##   gif, jpg or png the comic's 'ext' entry is updated accordingly.
##
## Parameters  :
##   $comics - reference to the comic configuration hash
##   $comic  - key of the comic to download
##   $date   - reference to the dates hash built by fetchDates()
##
## Returns     :
##   0 on success, otherwise an error string.
##
#######################################################################
sub indexDownload {
    my ( $comics, $comic, $date ) = @_;
    my ( @lines, $comicLine, $mainURL );
    my $pageURL    = $comics->{$comic}{'url'};
    my $comicIndex = "$indexDir/index.$comic";

    ## List-form system(): the URL never passes through a shell.
    system( 'wget', '-q',
            "--referer=$pageURL",
            "--user-agent=$USER_AGENT",
            $pageURL, '-O', $comicIndex );

    my $fh;
    if ( ! open( $fh, '<', $comicIndex ) ) {
        return "ERROR: Can't open index file for " . $comics->{$comic}{'fullName'} .
               " (" . $pageURL . ")";
    }
    while ( my $line = <$fh> ) {
        ## NOTE(review): this strips line endings, tabs and runs of
        ## spaces from each line, but the lines are still matched one
        ## at a time below - confirm that is the intended behaviour.
        $line =~ s/\R|\ \ +|\t//g if ( $comics->{$comic}{'remove_newlines'} );
        push @lines, $line;
    }
    close( $fh );

    unlink( $comicIndex );

    $mainURL = $pageURL;
    ## I need to figure out how to merge these two in to one regex.
    $mainURL =~ s/(http:\/\/.*)(?:\/.*\/){1,}.*/$1/;
    $mainURL =~ s/([a-z])\/.*/$1/i;

    ##
    ## Find the comic strip URL based on the specified regex in the search
    ##
    foreach my $line ( @lines ) {
        ## Require an actual capture: a search regex without a group
        ## would otherwise leave $1 undefined.
        if ( ( $line =~ m/$comics->{$comic}{'search'}/i ) && defined $1 ) {
            $comicLine = $1;
            chomp $comicLine;
        }
    }

    ##
    ## Save the file to the directory the main program reads it from.
    ##
    my $cDate = $date->{'day2'};

    if ( $comicLine ) {
        if ( $comicLine =~ m/(gif|jpg|png)/i ) { $comics->{$comic}{'ext'} = $1; }
        my $comicURL = ( $comicLine =~ m/http/ ) ? $comicLine : $mainURL . $comicLine;

        # Turn HTML-escaped ampersands back into plain ones.
        $comicURL =~ s/&amp;/&/g;

        my $target = "$imageDir/$comic-$cDate.$comics->{$comic}{'ext'}";
        my $status = system( 'wget',
                             "--user-agent=$USER_AGENT",
                             "--referer=$pageURL",
                             '-q', $comicURL, '-O', $target );

        ## Report a failed image download instead of claiming success.
        return 0 if ( $status == 0 );
    }

    return "ERROR: Could not download comic $comics->{$comic}{'fullName'}";
}
449:
#######################################################################
## Function    : parseComic
##
## Description :
##   Expands the placeholders in a comic's 'search' template:
##     __year__  __year2__  __mon__  __mon2__  __day__  __day2__
##   are replaced with the matching entries of the dates hash and
##   __ext__ with the comic's 'ext' entry.  A trailing newline is
##   removed from the result.
##
## Parameters  :
##   $comics - reference to the comic configuration hash
##   $comic  - key of the comic
##   $date   - reference to the dates hash
##
## Returns     :
##   The expanded string.
##
#######################################################################
sub parseComic ($$) {
    my ( $comics, $comic, $date ) = @_;
    my $expanded = $comics->{$comic}{'search'};

    ## Placeholder / value pairs, applied in this order.
    my @replacements = (
        [ '__year__',  $date->{'year'}  ],
        [ '__year2__', $date->{'year2'} ],
        [ '__mon__',   $date->{'mon'}   ],
        [ '__mon2__',  $date->{'mon2'}  ],
        [ '__day__',   $date->{'day'}   ],
        [ '__day2__',  $date->{'day2'}  ],
        [ '__ext__',   $comics->{$comic}{'ext'} ],
    );

    foreach my $pair ( @replacements ) {
        my ( $token, $value ) = @{$pair};
        $expanded =~ s/\Q$token\E/$value/g;
    }
    chomp $expanded;

    return $expanded;
}
467:
#######################################################################
## Function    : fetchDates
##
## Description :
##   Builds the dates hash for the day that lies $days_ago days
##   (86400 seconds each) before now, in local time.
##
## Returns     :
##   A hash (list) with the keys
##     day, mon, year  - numeric day, month (1-12) and four digit year
##     day2, mon2      - day and month padded to two digits
##     year2           - last two digits of the year
##     dow             - day of week index, 0 = Sunday
##     wday            - day of week name
##
#######################################################################
sub fetchDates () {
    ## Own copy of the day names: the file-level @days is not filled in
    ## yet when this function is first called.
    my @days = qw/ Sunday Monday Tuesday Wednesday Thursday Friday Saturday /;

    my $when = time - ( 86400 * $days_ago );
    my ( $mday, $month, $year, $dow ) = ( localtime( $when ) )[3,4,5,6];

    $year  += 1900;
    $month += 1;

    my %dates = ();
    $dates{'day'}   = $mday;
    $dates{'mon'}   = $month;
    $dates{'year'}  = $year;
    $dates{'dow'}   = $dow;
    $dates{'year2'} = substr $year, 2, 2;
    $dates{'day2'}  = ( $mday  < 10 ) ? "0" . $mday  : $mday;
    $dates{'mon2'}  = ( $month < 10 ) ? "0" . $month : $month;
    $dates{'wday'}  = $days[$dow];

    return %dates;
}
1.8 nick 485:
###############################################################################
##
## &fetchOptions( );
##
## Parses the command line into a hash with the keys 'days', 'help' and
## 'man'.  Shows the usage message when parsing fails or --help is
## given, and the full manual (read from DATA) when --man is given.
##
## Returns the options hash (list).
##
###############################################################################
sub fetchOptions {
    my %opts;

    my $parsed = &GetOptions(
        "days:i" => \$opts{'days'},
        "help|?" => \$opts{'help'},
        "man"    => \$opts{'man'},
    );

    if ( ! $parsed ) {
        &pod2usage( );
    }
    if ( defined $opts{'help'} ) {
        &pod2usage( );
    }
    if ( defined $opts{'man'} ) {
        &pod2usage( { -verbose => 2, -input => \*DATA } );
    }

    return %opts;
}
506:
507: __END__
508:
509: =head1 NAME
510:
511: fetch.pl - Fetches comics and places them all locally in a single html file.
512:
513: =head1 SYNOPSIS
514:
515: fetch.pl [options]
516:
517: Options:
518: --days,d Fetch comics from X days ago
519: --help,? Display the basic help menu
520: --man,m Display the detailed man page
521:
522: =head1 DESCRIPTION
523:
524: =head1 HISTORY
525:
526: =head1 AUTHOR
527:
528: Nicholas DeClario <nick@declario.com>
529:
530: =head1 BUGS
531:
532: This is a work in progress. Please report all bugs to the author.
533:
534: =head1 SEE ALSO
535:
536: =head1 COPYRIGHT
537:
538: =cut
539:
540:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>