Annotation of comics/fetch.pl.new, revision 1.29
1.1 nick 1: #!/usr/bin/perl -w
2:
1.15 nick 3: ###############################################################################
1.16 nick 4: # $Log: fetch.pl.new,v $
1.29 ! nick 5: # Revision 1.28 2020/06/10 21:14:31 nick
! 6: # Updated for w3 validation.
! 7: #
1.28 nick 8: # Revision 1.27 2019/04/15 12:50:23 nick
9: # The script was unable to handle html '&' and convert it, so I added that. I probably should see if there's a library or something that handles all those automagically but I just tossed a regex in there for now that does the trick.
10: #
1.27 nick 11: # Revision 1.26 2018/04/22 14:03:54 nick
12: # Changed the default for Sunday comics that was causing issues with some comics.
13: #
1.26 nick 14: # Revision 1.25 2018/02/12 13:30:58 nick
15: # Added an easier to compare date string to determine if the status json file was updated today and report if it wasn't.
16: #
1.25 nick 17: # Revision 1.24 2018/02/06 14:31:06 nick
18: # A status report is now generated in JSON that can easily be scanned so that
19: # I can be alerted when there are failures that I miss if I don't read the
20: # comics that day.
21: #
1.24 nick 22: # Revision 1.23 2018/01/26 13:05:27 nick
23: # Added a new config option to remove all newline from the resulting index.html
24: # file. This allows for easier parsing for certain comics. I then updated
25: # the URLs to search for and enabled the newline removal for a handful
26: # of uComics.
27: #
28: # I believe I've also properly fixed the Comic Config version displayed on
29: # the webpage itself.
30: #
1.23 nick 31: # Revision 1.22 2017/12/05 13:37:40 nick
32: # Added the CVS config version to the output.
33: #
1.22 nick 34: # Revision 1.21 2015/10/26 14:25:40 nick
35: # Fixed a bug that was improperly including the day of week string, preventing the weekend comics from fetching properly.
36: #
1.21 nick 37: # Revision 1.20 2015/10/22 12:58:44 nick
38: # Added the ability for Sunday only comics. Stonesoup is no longer weekdays, this has been added to Sunday only. I also added Foxtrot Classics for weekdays and Foxtrot for Sundays.
39: #
1.20 nick 40: # Revision 1.19 2015/07/13 12:56:58 nick
41: # Added Sally Forth and Pearls Before Swine. Adding Sally Forth required a change in the 'wget' command for fetching the index file to include 'user-agent' and 'referer'.
42: #
1.19 nick 43: # Revision 1.18 2015/05/07 12:31:43 nick
44: # Added favicon
45: #
1.18 nick 46: # Revision 1.17 2015/02/19 14:56:10 nick
47: # Fixed a problem that forced everything to JPG. This would kill GIF animations, but would not display the gifs either because 'convert' appends an index number to the end of the file name for each frame of the GIF animation. I fixed this to maintain GIF compatibility as well as rewriting how the script fetches the size of the file. Additionally, I updated the configuration for Questionable Content to search for GIF or JPG, which is what triggered this entire update.
48: #
1.17 nick 49: # Revision 1.16 2015/02/05 18:05:58 nick
50: # Changed the background and added a fancy title.
51: #
1.16 nick 52: # Revision 1.15 2015/01/19 13:46:19 nick
53: # *** empty log message ***
54: #
1.15 nick 55: ###############################################################################
56:
1.1 nick 57: use strict;
58: use File::Path;
59: use Data::Dumper;
1.8 nick 60: use Pod::Usage;
61: use Getopt::Long;
1.24 nick 62: use JSON::Create 'create_json';
1.21 nick 63: use Date::Calc qw/Date_to_Text_Long Today Day_of_Week Day_of_Week_to_Text/;
1.16 nick 64:
##
## Some default values
##
## NOTE: declaration order matters here.  $comicConfigVer must exist before
## readComicConfig() runs (that sub assigns to it), and $days_ago must be
## set before fetchDates() is called (that sub reads it).
##
my $ver            = '$Id: fetch.pl.new,v 1.28 2020/06/10 21:14:31 nick Exp $';
my $comicFile      = "comics.conf";
my $comicConfigVer = "Unknown";
my $reportFile     = "/home/httpd/html/daily/comics/status_report.json";
my %comics         = &readComicConfig ( $comicFile );
my %opts           = &fetchOptions( );
my $days_ago       = $opts{'days'} || 0;
my %dates          = &fetchDates();
my $baseDir        = $comics{'configs'}{'base_directory'} || ".";
my $imageDir       = $baseDir . "/" . ( $comics{'configs'}{'image_directory'} || "images" ) .
                     "/$dates{'mon2'}$dates{'year2'}";
my $indexDir       = $baseDir . "/" . ( $comics{'configs'}{'index_directory'} || "indexes" );
my $USER_AGENT     = "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.18) Gecko/20110628 Ubuntu/10.10 (maverick) Firefox/3.6.18";
my @days           = qw/ Sunday Monday Tuesday Wednesday Thursday Friday Saturday /;

my $DATE=`date`; chomp $DATE;
print STDOUT "Starting comic fetch at $DATE\n";

##
## Main program starts here
##
&checkDir ( [ $imageDir, $indexDir ] );

&writeTitle ( \%dates );

foreach my $comic ( sort keys %comics ) {

    ## Any key matching 'config' holds settings rather than a comic.
    next if ( $comic =~ m/config/ );

    ## Skip if this is Sunday and the comic is weekdays only
    if (($dates{'wday'} eq "Sunday") &&
        ($comics{$comic}{'not_sunday'} == 1)) {
        print "Skipping '$comic'; Weekdays only.\n";
        next;
    }

    ## Skip if Sunday only comic and it's not Sunday.
    if (($dates{'wday'} ne "Sunday") &&
        ($comics{$comic}{'sunday_only'} == 1)) {
        print "Skipping '$comic' ($comics{$comic}{'sunday_only'}); Sunday only.\n";
        next;
    }

    $comics{$comic}{'error'} = &downloadComic ( \%comics, $comic, \%dates );
    &writeComic ( \%comics, $comic, \%dates );

    ## Nothing to measure or resize when the download already failed.
    next if ( $comics{$comic}{'error'} );

    my $file = "$imageDir/$comic-$dates{'day2'}.$comics{$comic}{'ext'}";
    my $size = 0;

    ## List-form pipe open: the file name is handed to identify as a single
    ## argument and never passes through a shell.
    my $img;
    if ( ! open( $img, '-|', '/usr/bin/identify', '-verbose', $file ) ) {
        warn "Can't run identify on $file: $!\n";
        next;
    }
    while ( my $line = <$img> ) {
        ## Only the first geometry line is used (width in pixels).
        if ( $size == 0 && $line =~ m/^\s+geometry:\s+(\d+)x\d+.*/i ) {
            $size = $1;
        }
    }
    close($img);

    ## Shrink anything wider than 640 pixels in place.
    system( '/usr/bin/convert', '-resize', '640', $file, $file )
        if ( $size > 640 );
}

## &writeMainIndex ( \%dates );

&writeFooter( \%dates );

print STDOUT "Status written to $reportFile.\n"
    if (&writeStatusReportJSON(\%comics, $reportFile));

$DATE=`date`; chomp( $DATE );
print STDOUT "Completed comic fetch at $DATE\n";

## End
141:
#######################################################################
## Function    : downloadComic
##
## Description :
##   Looks at the comic's 'mode' setting and hands the work to the
##   matching download routine:
##     mode 1 -> indexDownload  (scrape the image URL from a page)
##     mode 2 -> directDownload (URL built from the date tokens)
##
## Parameters  : $comics - hash ref of all comic definitions
##               $comic  - key of the comic to fetch
##               $date   - hash ref of date parts
##
## Returns     : whatever the selected download routine returns, or an
##               "ERROR: ..." string when the mode is not 1 or 2.
##
#######################################################################
sub downloadComic {
    my ( $comics, $comic, $date ) = @_;

    my $mode = $comics->{$comic}{'mode'};
    $mode = '' unless defined $mode;

    ## The helpers are called with '&' so that any prototype declared on
    ## them further down the file is not checked (or warned about).
    return &indexDownload ( $comics, $comic, $date )  if ( $mode eq 1 );
    return &directDownload ( $comics, $comic, $date ) if ( $mode eq 2 );

    return "ERROR: Unknown download method specified for $comics->{$comic}{'fullName'}.";
}
168:
#######################################################################
## Function    : readComicConfig
##
## Description :
##   Parses the comic configuration file.  Three kinds of lines are
##   recognised:
##     * a comment carrying a CVS "$Id: ... $" keyword - the revision
##       number is stored in the file-level $comicConfigVer;
##     * a non-comment line with at least two commas - a comic entry:
##         name,url,search,mode,fullName,ext,not_sunday,sunday_only,remove_newlines
##       __YEAR__, __MON__ and __DAY__ are replaced with today's date;
##     * "key = value" - stored under the 'configs' entry.
##
## Parameters  : $comicFile - path of the configuration file
##
## Returns     : hash (as a list) keyed by comic name, plus 'configs'.
##               An unreadable file yields a warning and an empty hash.
##
#######################################################################
sub readComicConfig {
    my ( $comicFile ) = @_;
    my %comicConfig = ( );

    my ( $year, $mon, $day ) = ( localtime(time) )[5,4,3];
    $year += 1900;
    $mon = sprintf("%02d", ($mon + 1));
    $day = sprintf("%02d", $day);

    my $fh;
    if ( ! open( $fh, '<', $comicFile ) ) {
        warn "WARNING: Can't read comic config '$comicFile': $!\n";
        return %comicConfig;
    }

    while ( my $line = <$fh> ) {
        ## Drop the newline so the last field of an entry (e.g. 'ext' or
        ## 'mode' on a short line) does not carry it along.
        chomp $line;

        if ( $line =~ m/^#.* \$[Ii][dD]: .*,v\ (.*)\ \d{4}\/.*\$$/ ) {
            $comicConfigVer = $1;
        }

        if ( ( $line !~ m/^#/ ) && ( $line =~ m/,.*,/ ) ) {
            $line =~ s/__YEAR__/$year/g;
            $line =~ s/__MON__/$mon/g;
            $line =~ s/__DAY__/$day/g;

            my @res  = split /,/, $line;
            my $name = $res[0];

            @{ $comicConfig{$name} }{ qw/url search mode fullName ext/ } = @res[ 1 .. 5 ];
            $comicConfig{$name}{'not_sunday'}      = sprintf("%d", $res[6] || 0);
            $comicConfig{$name}{'sunday_only'}     = sprintf("%d", $res[7] || 0);
            $comicConfig{$name}{'remove_newlines'} = sprintf("%d", $res[8] || 0);
            $comicConfig{$name}{'error'}           = 0;
        }
        elsif ( $line =~ m/(.*)\s+=\s+(.*)/ ) {
            $comicConfig{'configs'}{$1} = $2;
        }
    }
    close ($fh);

    return %comicConfig;
}
211:
#######################################################################
## Function    : writeStatusReportJSON
##
## Description :
##   Writes a JSON status report with today's date (YYYYMMDD), one
##   entry per comic that has a 'fullName', and the number of comics
##   whose 'error' field is true.
##
## Parameters  : $comicsRef - hash ref of all comic definitions
##               $filename  - path of the report file to (over)write
##
## Returns     : the result of close() on the report file (true on
##               success).  Dies when the file cannot be created.
##
#######################################################################
sub writeStatusReportJSON {
    my ( $comicsRef, $filename ) = @_;

    ## One localtime() call so the three date parts cannot straddle midnight.
    my ( $mday, $mon, $year ) = ( localtime )[3,4,5];
    my $shortDate = sprintf("%d%02d%02d", $year + 1900, $mon + 1, $mday);

    ## 'comics' must be an array ref; an empty list "()" would collapse and
    ## leave the hash with an odd number of elements.
    my %json = ( 'date' => $shortDate, 'comics' => [] );
    my $totalErrors = 0;

    foreach my $comic ( sort keys %$comicsRef ) {
        my $entry = $comicsRef->{$comic};
        next unless $entry->{'fullName'};

        if ( $entry->{'error'} ) {
            push @{ $json{'comics'} }, {
                'comicName' => "$entry->{'fullName'}",
                'error'     => "$entry->{'error'}",
                'status'    => "Error",
            };
            $totalErrors += 1;
        }
        else {
            push @{ $json{'comics'} }, {
                'comicName' => "$entry->{'fullName'}",
                'error'     => 0,
                'status'    => "Successfull",
            };
        }
    }
    $json{'totalErrors'} = $totalErrors;

    open( my $report, '>', $filename )
        or die ("ERROR: Failed to create status report: $!\n");
    print {$report} create_json (\%json);

    return close($report);
}
244:
#######################################################################
## Function    : writeComic
##
## Description :
##   Appends one table row for a comic to the day's index page:
##   the image row when the comic has no error, otherwise a row
##   showing the error text in red.
##
## Parameters  : $comics - hash ref of all comic definitions
##               $comic  - key of the comic to write
##               $date   - hash ref of date parts (dow, year2, mon2, day2)
##
## Returns     : 0 after writing, 1 when the index file cannot be opened.
##
## Side effect : the comic's 'fullName' is HTML-escaped in place.
##
#######################################################################
sub writeComic {
    my ( $comics, $comic, $date ) = @_;
    my $entry     = $comics->{$comic};
    my $sd        = substr( $days[$date->{'dow'}], 0, 3 );
    my $indexFile = $indexDir . "/index-" . $date->{'year2'} .
                    $date->{'mon2'} . $date->{'day2'} . "-" .
                    $sd . ".html";

    ## Escape bare ampersands so the name is valid XHTML text.  Sequences
    ## that are already "&amp;" are left alone so a repeated call does not
    ## double-escape.
    $entry->{'fullName'} =~ s/&(?!amp;)/&amp;/g;

    my $index;
    if ( ! open( $index, '>>', $indexFile ) ) {
        warn "WARNING: Can't append to $indexFile: $!\n";
        return 1;
    }

    if ( ! $entry->{'error'} ) {
        print {$index} <<EOF;

<!-- ********* Begin $comic ($entry->{'fullName'}) ******* -->
<tr>
<td align="left">
<font color="blue"><b>$entry->{'fullName'}</b></font>
<font size="-2">
<a href="$entry->{'url'}">
$entry->{'url'}
</a>
</font><br/>
<img src="../images/$date->{'mon2'}$date->{'year2'}/$comic-$date->{'day2'}.$entry->{'ext'}" alt="$comic-$date->{'day2'}" />
<br/><br/>
</td></tr>
<!-- ********* Finish $comic ($entry->{'fullName'}) ******* -->

EOF
    }
    else {
        ## The row is opened here as well so the closing </td></tr> below
        ## has matching start tags.
        print {$index} <<EOF;
<tr>
<td align="left">
<font color="blue"><b>$entry->{'fullName'}</b></font>
<font size="-2">
<a href="$entry->{'url'}">
$entry->{'url'}
</a>
</font><br/>
<font color="red"><b>$comic : $entry->{'error'}</b></font><br/>
</td>
</tr>
EOF
    }

    close ($index);

    return 0;
}
292:
293:
294: #######################################################################
295: #######################################################################
296: sub writeMainIndex ($$) {
297: my ( $date ) = @_;
298:
299: }
300:
301:
302: #######################################################################
303: #######################################################################
304: sub writeFooter {
305: my ( $date ) = @_;
1.11 nick 306: my $sd = substr( join( '', $days[$date->{'dow'}] ), 0, 3 );
1.12 nick 307: my $indexFile = $indexDir . "/index-" . $date->{'year2'} .
308: $date->{'mon2'} . $date->{'day2'} . "-" .
309: $sd . ".html";
1.1 nick 310: my $sysDate = `date`;
311:
312: open INDEX, ">>$indexFile";
313: print INDEX <<EOF;
314: </table>
1.3 nick 315: <center>
1.28 nick 316: Generated on: <font size="2" color="green">$sysDate</font><br/>
317: Version: <font size="2" color="green">$ver</font><br />
318: Config Version: <font size="2" color="green">$comicConfigVer</font><br />
319: CVS: <a href="http://demandred.dyndns.org:3000/cgi-bin/cvsweb/comics/">http://demandred.dyndns.org/cgi-bin/cvsweb/comics/</a>
320: <br />
1.1 nick 321: <a href="http://validator.w3.org/check?uri=referer"><img
322: src="http://www.w3.org/Icons/valid-xhtml10-blue" alt="Valid XHTML 1.0 Transitional" height="31" width="88" border="0" /></a>
323: </center>
324:
325: </body>
326: </html>
327: EOF
328: close( INDEX );
329: }
330:
#######################################################################
## Function    : checkDir
##
## Description :
##   Makes sure every given directory exists, creating missing ones
##   (including intermediate directories) with mkpath.
##
## Parameters  : a list of directory names and/or array refs of
##               directory names; array refs are flattened, so both
##               checkDir( $a, $b ) and checkDir( [ $a, $b ] ) work.
##
## Returns     : nothing
##
#######################################################################
sub checkDir {
    ## Flatten array refs so that -d is applied to each directory name
    ## instead of to the stringified reference.
    my @dirs = map { ref($_) eq 'ARRAY' ? @$_ : $_ } @_;

    foreach my $dir ( @dirs ) {
        next if ( ! defined $dir || $dir eq '' );
        mkpath( $dir ) if ( ! -d $dir );
    }

    return;
}
340:
#######################################################################
## Function    : writeTitle
##
## Description :
##   Creates (truncating any existing file) the day's index page and
##   writes the XHTML header, page heading image and the long-form
##   date, leaving the comics table open for writeComic/writeFooter.
##
## Parameters  : $date - hash ref of date parts
##                       (dow, day, mon, year, year2, mon2, day2)
##
## Returns     : result of close() on the index file; nothing (after a
##               warning) when the file cannot be created.
##
#######################################################################
sub writeTitle {
    my ( $date ) = @_;
    my $sd        = substr( $days[$date->{'dow'}], 0, 3 );
    my $indexFile = $indexDir . "/index-" . $date->{'year2'} .
                    $date->{'mon2'} . $date->{'day2'} . "-" .
                    $sd . ".html";
    my $today = $days[$date->{'dow'}] . " " . $date->{'mon'} . "/" . $date->{'day'} . "/" . $date->{'year'};

    ## Use the date being fetched (not necessarily the current day) so the
    ## heading agrees with <title> and the file name.
    my $today_long = Date_to_Text_Long( $date->{'year'}, $date->{'mon'}, $date->{'day'} );

    my $index;
    if ( ! open( $index, '>', $indexFile ) ) {
        warn "WARNING: Can't create $indexFile: $!\n";
        return;
    }

    print {$index} <<EOF;
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<link href="/daily/comics/comics.css" type="text/css" rel="stylesheet" media="screen" />
<link rel="shortcut icon" href="./favicon.ico" />
<title>Daily Comics for $today</title>
</head>
<body bgcolor="#FFFFFF">
<table align="center" cellpadding="0" cellspacing="0" border="0">
<tr><td align="left"><img src="images/daily_comics_heading01.png" alt="Comic Page Heading" /></td></tr>
<tr><td align="left">$today_long</td></tr>
<tr><td> </td></tr>
EOF

    return close ($index);
}
371:
#######################################################################
## Function    : directDownload
##
## Description :
##   Fetches a comic whose image URL is built directly from the
##   'search' template (see parseComic), pipes it through convert and
##   stores it as images/<MMYY>/<comic>-<DD>.jpg.
##
## Parameters  : $comics - hash ref of all comic definitions
##               $comic  - key of the comic to fetch
##               $date   - hash ref of date parts (mon2, year2, day2)
##
## Returns     : the value of system() for the wget | convert pipeline
##               (0 on success).
##
## NOTE(review): the output is always written as .jpg under a relative
##   "images/" directory, while the main program looks the file up via
##   the configured image directory and the comic's 'ext' - confirm
##   these agree for every mode-2 comic.
##
#######################################################################
sub directDownload {
    my ( $comics, $comic, $date ) = @_;
    my $file = &parseComic ( $comics, $comic, $date );

    ## Single-quote a value for the shell, escaping embedded single quotes.
    ## A pipeline needs the shell, so every interpolated value is quoted
    ## rather than passed as a list.
    my $quote = sub {
        my ( $text ) = @_;
        $text = '' unless defined $text;
        $text =~ s/'/'\\''/g;
        return "'$text'";
    };

    ##
    ## Save the file to the appropriate directory
    ##
    my $cDir  = $date->{'mon2'} . $date->{'year2'};
    my $cDate = $date->{'day2'};

    my $cmd = "wget -q " . $quote->( $file ) .
              " --referer=" . $quote->( $comics->{$comic}{'url'} ) .
              " --user-agent=" . $quote->( $USER_AGENT ) .
              " -O - | /usr/bin/convert - " .
              $quote->( "jpeg:images/$cDir/$comic-$cDate.jpg" );

    return system($cmd);
}
388:
#######################################################################
## Function    : indexDownload
##
## Description :
##   Fetches the comic's web page, scans it line by line with the
##   comic's 'search' regex (first capture group = image location),
##   then downloads that image to images/<MMYY>/<comic>-<DD>.<ext>.
##   The last matching line wins.  When the image location contains
##   gif, jpg or png, the comic's 'ext' is updated to match.
##
## Parameters  : $comics - hash ref of all comic definitions
##               $comic  - key of the comic to fetch
##               $date   - hash ref of date parts (mon2, year2, day2)
##
## Returns     : 0 on success, otherwise an "ERROR: ..." string (page
##               unreadable, no match, or the image download failed).
##
#######################################################################
sub indexDownload {
    my ( $comics, $comic, $date ) = @_;
    my $entry      = $comics->{$comic};
    my $comicIndex = "indexes/index.$comic";
    my ( @lines, $comicLine, $mainURL );

    ## List-form system(): every value is a single argument, no shell.
    system( 'wget', '-q',
            "--referer=$entry->{'url'}",
            "--user-agent=$USER_AGENT",
            $entry->{'url'}, '-O', $comicIndex );

    my $page;
    if ( ! open( $page, '<', $comicIndex ) ) {
        return "ERROR: Can't open index file for " . $entry->{'fullName'} .
               " (" . $entry->{'url'} . ")";
    }
    while ( my $line = <$page> ) {
        $line =~ s/\R|\ \ +|\t//g if ( $entry->{'remove_newlines'} );
        push @lines, $line;
    }
    close ($page);

    unlink ($comicIndex);

    ## Reduce the page URL to its site root; used as a prefix when the
    ## scraped image location is relative.
    $mainURL = $entry->{'url'};
    ## I need to figure out how to merge these two in to one regex.
    $mainURL =~ s/(http:\/\/.*)(?:\/.*\/){1,}.*/$1/;
    $mainURL =~ s/([a-z])\/.*/$1/i;

    ##
    ## Find the comic strip URL based on the specified regex in the search
    ##
    foreach my $line (@lines) {
        if ( $line =~ m/$entry->{'search'}/i ) {
            $comicLine = $1; chomp $comicLine;
        }
    }

    if ( ! $comicLine ) {
        return "ERROR: Could not download comic $entry->{'fullName'}";
    }

    ##
    ## Save the file to the appropriate directory
    ##
    my $cDir  = $date->{'mon2'} . $date->{'year2'};
    my $cDate = $date->{'day2'};

    if ( $comicLine =~ m/(gif|jpg|png)/i ) { $entry->{'ext'} = $1; }
    my $comicURL = ( $comicLine =~ m/http/ ) ? $comicLine : $mainURL . $comicLine;

    ## The URL was scraped from HTML, so decode escaped ampersands.
    $comicURL =~ s/\&amp\;/&/g;

    my $status = system( 'wget',
                         "--user-agent=$USER_AGENT",
                         "--referer=$entry->{'url'}",
                         '-q', $comicURL,
                         '-O', "images/$cDir/$comic-$cDate.$entry->{'ext'}" );

    ## Report a failed image download instead of claiming success.
    if ( $status != 0 ) {
        return "ERROR: Could not download comic $entry->{'fullName'}";
    }

    return 0;
}
450:
#######################################################################
## Function    : parseComic
##
## Description :
##   Expands the date/extension placeholders in the comic's 'search'
##   template:
##     __year__  __year2__  __mon__  __mon2__  __day__  __day2__
##   are replaced from $date, and __ext__ from the comic's 'ext'.
##
## Parameters  : $comics - hash ref of all comic definitions
##               $comic  - key of the comic
##               $date   - hash ref of date parts
##
## Returns     : the expanded string with any trailing newline removed.
##
#######################################################################
sub parseComic {
    my ( $comics, $comic, $date ) = @_;
    my $string = $comics->{$comic}{'search'};
    $string = '' unless defined $string;

    ## Applied in this order, one placeholder at a time.
    my @replacements = (
        [ '__year__'  => $date->{'year'}  ],
        [ '__year2__' => $date->{'year2'} ],
        [ '__mon__'   => $date->{'mon'}   ],
        [ '__mon2__'  => $date->{'mon2'}  ],
        [ '__day__'   => $date->{'day'}   ],
        [ '__day2__'  => $date->{'day2'}  ],
        [ '__ext__'   => $comics->{$comic}{'ext'} ],
    );

    foreach my $pair ( @replacements ) {
        my ( $token, $value ) = @$pair;
        $value = '' unless defined $value;
        $string =~ s/\Q$token\E/$value/g;
    }
    chomp $string;

    return $string;
}
468:
#######################################################################
## Function    : fetchDates
##
## Description :
##   Builds the date parts for the day being fetched, i.e. the current
##   time minus $days_ago days (86400 seconds each).
##
## Returns     : hash (as a list) with the keys
##                 day, mon (1-12), year (4 digits), dow (0 = Sunday),
##                 year2, mon2, day2 (two-character forms),
##                 wday (English day name)
##
#######################################################################
sub fetchDates () {
    my @dayNames = qw/ Sunday Monday Tuesday Wednesday Thursday Friday Saturday /;

    my $when = time - (86400 * $days_ago );
    my ( $mday, $month, $year, $dow ) = ( localtime($when) )[3,4,5,6];

    $year  += 1900;
    $month += 1;

    return (
        'day'   => $mday,
        'mon'   => $month,
        'year'  => $year,
        'dow'   => $dow,
        'year2' => substr( $year, 2, 2 ),
        'day2'  => ( ( $mday < 10 )  ? "0" . $mday  : $mday ),
        'mon2'  => ( ( $month < 10 ) ? "0" . $month : $month ),
        'wday'  => $dayNames[$dow],
    );
}
1.8 nick 486:
###############################################################################
##
##  &fetchOptions( );
##
##  Parses the command line with Getopt::Long and returns the result as a
##  hash with the keys 'days', 'help' and 'man' (undef when not given).
##  Prints usage and exits on a bad option or --help; prints the full
##  manual from the DATA handle and exits on --man.
##
###############################################################################
sub fetchOptions {
    my ( $daysBack, $wantHelp, $wantMan );

    my $parsed = GetOptions(
        "days:i" => \$daysBack,
        "help|?" => \$wantHelp,
        "man"    => \$wantMan,
    );

    pod2usage( ) unless $parsed;
    pod2usage( ) if defined $wantHelp;
    pod2usage( { -verbose => 2, -input => \*DATA } ) if defined $wantMan;

    return (
        'days' => $daysBack,
        'help' => $wantHelp,
        'man'  => $wantMan,
    );
}
507:
508: __END__
509:
510: =head1 NAME
511:
512: fetch.pl - Fetches comics and places them all locally in a single html file.
513:
514: =head1 SYNOPSIS
515:
516: fetch.pl [options]
517:
518: Options:
519: --days,d Fetch comics from X days ago
520: --help,? Display the basic help menu
521: --man,m Display the detailed man page
522:
523: =head1 DESCRIPTION
524:
525: =head1 HISTORY
526:
527: =head1 AUTHOR
528:
529: Nicholas DeClario <nick@declario.com>
530:
531: =head1 BUGS
532:
533: This is a work in progress. Please report all bugs to the author.
534:
535: =head1 SEE ALSO
536:
537: =head1 COPYRIGHT
538:
539: =cut
540:
541:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>