Annotation of comics/fetch.pl.new, revision 1.26
1.1 nick 1: #!/usr/bin/perl -w
2:
1.15 nick 3: ###############################################################################
1.16 nick 4: # $Log: fetch.pl.new,v $
1.26 ! nick 5: # Revision 1.25 2018/02/12 13:30:58 nick
! 6: # Added an easier to compare date string to determine if the status json file was updated today and report if it wasn't.
! 7: #
1.25 nick 8: # Revision 1.24 2018/02/06 14:31:06 nick
9: # A status report is now generated in JSON that can easily be scanned so that
10: # I can be alerted when there are failures that I miss if I don't read the
11: # comics that day.
12: #
1.24 nick 13: # Revision 1.23 2018/01/26 13:05:27 nick
14: # Added a new config option to remove all newline from the resulting index.html
15: # file. This allows for easier parsing for certain comics. I then updated
16: # the URLs to search for and enabled the newline removal for a handful
17: # of uComics.
18: #
19: # I believe I've also properly fixed the Comic Config version displayed on
20: # the webpage itself.
21: #
1.23 nick 22: # Revision 1.22 2017/12/05 13:37:40 nick
23: # Added the CVS config version to the output.
24: #
1.22 nick 25: # Revision 1.21 2015/10/26 14:25:40 nick
26: # Fixed a bug that was improperly including the day of week string, preventing the weekend comics from fetching properly.
27: #
1.21 nick 28: # Revision 1.20 2015/10/22 12:58:44 nick
29: # Added the ability for Sunday only comics. Stonesoup is no longer weekdays, this has been added to Sunday only. I also added Foxtrot Classics for weekdays and Foxtrot for Sundays.
30: #
1.20 nick 31: # Revision 1.19 2015/07/13 12:56:58 nick
32: # Added Sally Forth and Pearls Before Swine. Adding Sally Forth required a change in the 'wget' command for fetching the index file to include 'user-agent' and 'referer'.
33: #
1.19 nick 34: # Revision 1.18 2015/05/07 12:31:43 nick
35: # Added favicon
36: #
1.18 nick 37: # Revision 1.17 2015/02/19 14:56:10 nick
38: # Fixed a problem that forced everything to JPG. This would kill GIF animations, but would not display the GIFs either because 'convert' appends an index number to the end of the file name for each frame of the GIF animation. I fixed this to maintain GIF compatibility as well as rewriting how the script fetches the size of the file. Additionally, I updated the configuration for Questionable Content to search for GIF or JPG, which is what triggered this entire update.
39: #
1.17 nick 40: # Revision 1.16 2015/02/05 18:05:58 nick
41: # Changed the background and added a fancy title.
42: #
1.16 nick 43: # Revision 1.15 2015/01/19 13:46:19 nick
44: # *** empty log message ***
45: #
1.15 nick 46: ###############################################################################
47:
1.1 nick 48: use strict;
49: use File::Path;
50: use Data::Dumper;
1.8 nick 51: use Pod::Usage;
52: use Getopt::Long;
1.24 nick 53: use JSON::Create 'create_json';
1.21 nick 54: use Date::Calc qw/Date_to_Text_Long Today Day_of_Week Day_of_Week_to_Text/;
1.16 nick 55:
##
## Some default values
##
## NOTE: the order of these declarations matters.  readComicConfig() stores
## the config file's revision in $comicConfigVer, fetchDates() reads
## $days_ago, and the directory names are built from %comics and %dates.
## The subroutines further down read several of these file-level variables
## directly ($indexDir, $USER_AGENT, @days, $ver, $comicConfigVer).
##
my $ver            = '$Id: fetch.pl.new,v 1.25 2018/02/12 13:30:58 nick Exp $';
my $comicFile      = "comics.conf";
my $comicConfigVer = "Unknown";
my $reportFile     = "/home/httpd/html/daily/comics/status_report.json";
my %comics         = &readComicConfig ( $comicFile );
my %opts           = &fetchOptions( );
my $days_ago       = $opts{'days'} || 0;
my %dates          = &fetchDates();
my $baseDir        = $comics{'configs'}{'base_directory'} || ".";
my $imageDir       = $baseDir . "/" . ( $comics{'configs'}{'image_directory'} || "images" ) .
                     "/$dates{'mon2'}$dates{'year2'}";
my $indexDir       = $baseDir . "/" . ( $comics{'configs'}{'index_directory'} || "indexes" );
my $USER_AGENT     = "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.18) Gecko/20110628 Ubuntu/10.10 (maverick) Firefox/3.6.18";
my @days           = qw/ Sunday Monday Tuesday Wednesday Thursday Friday Saturday /;

my $DATE=`date`; chomp $DATE;
print STDOUT "Starting comic fetch at $DATE\n";

##
## Main program starts here
##
&checkDir ( [ $imageDir, $indexDir ] );

&writeTitle ( \%dates );

foreach my $comic ( sort keys %comics ) {

    ## The 'configs' key holds the global settings, it is not a comic.
    next if ( $comic eq 'configs' );

    ## Skip if this is Sunday and the comic is weekdays only
    if ( ( $dates{'wday'} eq "Sunday" ) &&
         ( $comics{$comic}{'not_sunday'} == 1 ) ) {
        print "Skipping '$comic'; Weekdays only.\n";
        next;
    }

    ## Skip if Sunday only comic and it's not Sunday.
    if ( ( $dates{'wday'} ne "Sunday" ) &&
         ( $comics{$comic}{'sunday_only'} == 1 ) ) {
        print "Skipping '$comic' ($comics{$comic}{'sunday_only'}); Sunday only.\n";
        next;
    }

    $comics{$comic}{'error'} = &downloadComic ( \%comics, $comic, \%dates );
    &writeComic ( \%comics, $comic, \%dates );

    ## The extension may still carry trailing whitespace from the config
    ## line.  The shell used to swallow that silently; the commands below
    ## no longer go through a shell, so strip it here.
    my $ext = $comics{$comic}{'ext'};
    $ext = '' unless ( defined $ext );
    $ext =~ s/\s+\z//;

    my $file = "$imageDir/$comic-$dates{'day2'}.$ext";
    my $size = 0;

    ## Nothing to measure or resize if the download left no usable file.
    next unless ( -s $file );

    ## List-form pipe open: the file name is handed to identify as a single
    ## argument and is never interpreted by a shell.
    my $img;
    if ( ! open( $img, '-|', '/usr/bin/identify', '-verbose', $file ) ) {
        warn "Can't run identify on $file: $!\n";
        next;
    }
    while ( my $line = <$img> ) {
        ## Only the first geometry line (first frame) is used for the width.
        if ( ( $size == 0 ) && ( $line =~ m/^\s+geometry:\s+(\d+)x\d+.*/i ) ) {
            $size = $1;
        }
    }
    close( $img );

    ## Shrink anything wider than 640 pixels, in place.
    system( '/usr/bin/convert', '-resize', '640', $file, $file )
        if ( $size > 640 );
}

## &writeMainIndex ( \%dates );

&writeFooter( \%dates );

print STDOUT "Status written to $reportFile.\n"
    if (&writeStatusReportJSON(\%comics, $reportFile));

$DATE=`date`; chomp( $DATE );
print STDOUT "Completed comic fetch at $DATE\n";

## End
132:
#######################################################################
## Function    : downloadComic
##
## Description :
##    Determines the download method configured for the comic and calls
##    the appropriate function.
##
## Parameters  :
##    $comics - reference to the comic configuration hash
##    $comic  - key of the comic to download
##    $date   - reference to the date hash
##
## Returns     :
##    The result of the selected download function, or an error string
##    if the configured mode is not 1 (index download) or 2 (direct
##    download).
##
#######################################################################
sub downloadComic {
    my ( $comics, $comic, $date ) = @_;
    my $mode = $comics->{$comic}{'mode'};
    $mode = '' unless ( defined $mode );

    ## Hand on the reference we were given rather than the file-level hash.
    ## The '&' call form matches the rest of this file and keeps perl from
    ## warning that a prototyped sub was "called too early".
    return &indexDownload ( $comics, $comic, $date )  if ( $mode eq 1 );
    return &directDownload ( $comics, $comic, $date ) if ( $mode eq 2 );

    return "ERROR: Unknown download method specified for $comics->{$comic}{'fullName'}.";
}
159:
#######################################################################
## Function    : readComicConfig
##
## Description :
##    Reads the comic configuration file.  Comma separated lines describe
##    one comic each:
##       name,url,search,mode,fullName,ext,not_sunday,sunday_only,remove_newlines
##    Lines of the form "key = value" are stored under the 'configs' key.
##    __YEAR__, __MON__ and __DAY__ in a comic line are replaced with
##    today's date.  The revision found in a commented CVS Id line is
##    stored in the file-level $comicConfigVer.
##
## Parameters  :
##    $comicFile - path of the configuration file
##
## Returns     :
##    The configuration hash (as a list).  It is empty if the file could
##    not be opened; a warning is printed in that case.
##
#######################################################################
sub readComicConfig {
    my ( $comicFile ) = @_;
    my %comicConfig = ( );

    my ( $year, $mon, $day ) = ( localtime( time ) )[5,4,3];
    $year += 1900;
    $mon = sprintf( "%02d", ( $mon + 1 ) );
    $day = sprintf( "%02d", $day );

    my $fh;
    if ( ! open( $fh, '<', $comicFile ) ) {
        warn "WARNING: Can't open comic config '$comicFile': $!\n";
        return %comicConfig;
    }

    while ( my $line = <$fh> ) {
        ## Drop the line ending so the last field of a comic line does not
        ## keep a trailing newline.
        $line =~ s/\r?\n\z//;

        if ( $line =~ m/^#/ ) {
            ## The only comment of interest is the CVS Id line.
            if ( $line =~ m/^#.* \$[Ii][dD]: .*,v\ (.*)\ \d{4}\/.*\$$/ ) {
                $comicConfigVer = $1;
            }
            next;
        }

        if ( $line =~ m/,.*,/ ) {
            $line =~ s/__YEAR__/$year/g;
            $line =~ s/__MON__/$mon/g;
            $line =~ s/__DAY__/$day/g;

            my @res = split /,/, $line;
            my $name = $res[0];
            $comicConfig{$name}{'url'}             = $res[1];
            $comicConfig{$name}{'search'}          = $res[2];
            $comicConfig{$name}{'mode'}            = $res[3];
            $comicConfig{$name}{'fullName'}        = $res[4];
            $comicConfig{$name}{'ext'}             = $res[5];
            $comicConfig{$name}{'not_sunday'}      = sprintf( "%d", $res[6] || 0 );
            $comicConfig{$name}{'sunday_only'}     = sprintf( "%d", $res[7] || 0 );
            $comicConfig{$name}{'remove_newlines'} = sprintf( "%d", $res[8] || 0 );
            $comicConfig{$name}{'error'}           = 0;
        }
        elsif ( $line =~ m/(.*)\s+=\s+(.*)/ ) {
            $comicConfig{'configs'}{$1} = $2;
        }
    }
    close( $fh );

    return %comicConfig;
}
202:
#######################################################################
## Function    : writeStatusReportJSON
##
## Description :
##    Writes a JSON status report containing today's date (YYYYMMDD),
##    one entry per comic that has a 'fullName', and the total number
##    of comics that reported an error.
##
## Parameters  :
##    $comicsRef - reference to the comic configuration hash
##    $filename  - path of the report file to (over)write
##
## Returns     :
##    1 once the report has been written.  Dies if the file cannot be
##    created or closed.
##
#######################################################################
sub writeStatusReportJSON {
    my ( $comicsRef, $filename ) = @_;

    ## Take one timestamp so the pieces of the date cannot straddle midnight.
    my ( $mday, $mon, $year ) = ( localtime )[3,4,5];
    my $shortDate   = sprintf( "%d%02d%02d", $year + 1900, $mon + 1, $mday );
    my %json        = ( 'date' => $shortDate, 'comics' => [] );
    my $totalErrors = 0;

    foreach my $comic ( sort keys %{$comicsRef} ) {
        my $entry = $comicsRef->{$comic};

        ## Entries without a full name (e.g. 'configs') are not comics.
        next unless $entry->{'fullName'};

        ## Explicit dereference: pushing onto a bare scalar expression is
        ## rejected by perl 5.24 and later.
        if ( $entry->{'error'} ) {
            my %error = ( 'comicName' => "$entry->{'fullName'}",
                          'error'     => "$entry->{'error'}",
                          'status'    => "Error" );
            push @{ $json{'comics'} }, \%error;
            $totalErrors += 1;
        } else {
            ## The status text is kept exactly as before; report consumers
            ## may match on it.
            my %status = ( 'comicName' => "$entry->{'fullName'}",
                           'error'     => 0,
                           'status'    => "Successfull" );
            push @{ $json{'comics'} }, \%status;
        }
    }
    $json{'totalErrors'} = $totalErrors;

    open( my $report, '>', $filename )
        or die( "ERROR: Failed to create status report: $!\n" );
    print {$report} create_json( \%json );
    close( $report )
        or die( "ERROR: Failed to write status report: $!\n" );

    return 1;
}
235:
#######################################################################
## Function    : writeComic
##
## Description :
##    Appends one table row for the comic to the day's index file:
##    the comic image if the download succeeded, or the error text in
##    red if it did not.
##
## Parameters  :
##    $comics - reference to the comic configuration hash
##    $comic  - key of the comic to write
##    $date   - reference to the date hash
##
## Returns     :
##    0 (always).  A warning is printed if the index file can't be
##    opened.
##
#######################################################################
sub writeComic {
    my ( $comics, $comic, $date ) = @_;
    my $sd = substr( $days[$date->{'dow'}], 0, 3 );
    my $indexFile = $indexDir . "/index-" . $date->{'year2'} .
                    $date->{'mon2'} . $date->{'day2'} . "-" .
                    $sd . ".html";
    my $content;

    if ( ! $comics->{$comic}{'error'} ) {
        $content = <<EOF;

<!-- ********* Begin $comic ($comics->{$comic}{'fullName'}) ******* -->
<tr>
<td align="left">
<font color="blue"><b>$comics->{$comic}{'fullName'}</b></font>
<font size="-2">
<a href="$comics->{$comic}{'url'}">
$comics->{$comic}{'url'}
</a>
</font><br/>
<img src="../images/$date->{'mon2'}$date->{'year2'}/$comic-$date->{'day2'}.$comics->{$comic}{'ext'}" alt="$comic-$date->{'day2'}" />
<br/><br/>
</td></tr>
<!-- ********* Finish $comic ($comics->{$comic}{'fullName'}) ******* -->

EOF
    }
    else {
        ## The error row opens its own <tr><td> so that the closing tags
        ## below are balanced.
        $content = <<EOF;
<tr>
<td align="left">
<font color="blue"><b>$comics->{$comic}{'fullName'}</b></font>
<font size="-2">
<a href="$comics->{$comic}{'url'}">
$comics->{$comic}{'url'}
</a>
</font><br/>
<font color="red"><b>$comic : $comics->{$comic}{'error'}</b></font><br/>
</td>
</tr>
EOF
    }

    my $index;
    if ( ! open( $index, '>>', $indexFile ) ) {
        warn "WARNING: Can't append to $indexFile: $!\n";
        return 0;
    }
    print {$index} $content;
    close( $index );

    return 0;
}
282:
283:
#######################################################################
## Function    : writeMainIndex
##
## Description :
##    Placeholder.  Nothing is written; the argument is only unpacked.
##
#######################################################################
sub writeMainIndex ($$) {
    my ( $when ) = @_;
}
290:
291:
#######################################################################
## Function    : writeFooter
##
## Description :
##    Appends the closing part of the page to the day's index file:
##    the end of the table, the generation time, the script and config
##    versions, and the closing body/html tags.
##
## Parameters  :
##    $date - reference to the date hash
##
## Returns     :
##    Nothing useful.  A warning is printed if the index file can't be
##    opened.
##
#######################################################################
sub writeFooter {
    my ( $date ) = @_;
    my $sd = substr( $days[$date->{'dow'}], 0, 3 );
    my $indexFile = $indexDir . "/index-" . $date->{'year2'} .
                    $date->{'mon2'} . $date->{'day2'} . "-" .
                    $sd . ".html";
    my $sysDate = `date`;
    chomp( $sysDate );

    my $index;
    if ( ! open( $index, '>>', $indexFile ) ) {
        warn "WARNING: Can't append to $indexFile: $!\n";
        return;
    }
    print {$index} <<EOF;
</table>
<center>
<font size="2">
Generated on: <font color="green">$sysDate</font><br/>
Version: <font color="green">$ver</font><br />
Config Version: <font color="green">$comicConfigVer</font><br />
CVS: <a href="http://demandred.dyndns.org/cgi-bin/cvsweb/comics/">http://demandred.dyndns.org/cgi-bin/cvsweb/comics/</a>
<p>
<a href="http://validator.w3.org/check?uri=referer"><img
src="http://www.w3.org/Icons/valid-xhtml10-blue" alt="Valid XHTML 1.0 Transitional" height="31" width="88" border="0" /></a>
</p>
</center>

</body>
</html>
EOF
    close( $index );
    return;
}
322:
#######################################################################
## Function    : checkDir
##
## Description :
##    Makes sure every given directory exists, creating it (and any
##    missing parents) if necessary.
##
## Parameters  :
##    A list of directory names and/or references to arrays of
##    directory names.
##
#######################################################################
sub checkDir {
    ## The caller passes an array reference; flatten it so that each
    ## directory is tested on its own.
    my @dirs = map { ( ref( $_ ) eq 'ARRAY' ) ? @{$_} : $_ } @_;

    foreach my $dir ( @dirs ) {
        next if ( ! defined $dir || $dir eq '' );
        mkpath( $dir ) if ( ! -d $dir );
    }
    return;
}
332:
#######################################################################
## Function    : writeTitle
##
## Description :
##    Creates (overwrites) the day's index file and writes the page
##    header: doctype, head, and the heading rows of the main table.
##    The comic rows and the footer are appended by other functions.
##
## Parameters  :
##    $date - reference to the date hash
##
## Returns     :
##    Nothing useful.  A warning is printed if the index file can't be
##    created.
##
#######################################################################
sub writeTitle {
    my ( $date ) = @_;
    my $sd = substr( $days[$date->{'dow'}], 0, 3 );
    my $indexFile = $indexDir . "/index-" . $date->{'year2'} .
                    $date->{'mon2'} . $date->{'day2'} . "-" .
                    $sd . ".html";
    my $today = $days[$date->{'dow'}] . " " . $date->{'mon'} . "/" . $date->{'day'} . "/" . $date->{'year'};

    ## Use the date being fetched (which honours --days) rather than the
    ## current date, so the heading agrees with the page title.
    my $today_long = Date_to_Text_Long( $date->{'year'}, $date->{'mon'}, $date->{'day'} );

    my $index;
    if ( ! open( $index, '>', $indexFile ) ) {
        warn "WARNING: Can't create $indexFile: $!\n";
        return;
    }
    print {$index} <<EOF;
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<link href="/daily/comics/comics.css" type="text/css" rel="stylesheet" media="screen" />
<link rel="shortcut icon" href="./favicon.ico" />
<title>Daily Comics for $today</title>
</head>
<body bgcolor="#FFFFFF">
<table align="center" cellpadding="5" cellspacing="0">
<tr><td>
<table cellpadding="0" cellspacing="0" border="0">
<tr><td align="left"><img src="images/daily_comics_heading01.png" alt="Daily Comics" /></td></tr>
<tr><td align="left">$today_long</td></tr>
<tr><td>&nbsp;</td></tr>
</table>
</td></tr>

EOF
    close( $index );
    return;
}
367:
#######################################################################
## Function    : directDownload
##
## Description :
##    Downloads a comic whose image URL is built directly from the
##    'search' template (see parseComic) and pipes it through
##    'convert' to store it as a JPEG.
##
## Parameters  :
##    $comics - reference to the comic configuration hash
##    $comic  - key of the comic to download
##    $date   - reference to the date hash
##
## Returns     :
##    The status returned by system() for the download pipeline
##    (0 on success).
##
#######################################################################
sub directDownload {
    my ( $comics, $comic, $date ) = @_;
    my $file = &parseComic ( $comics, $comic, $date );

    ## Quote a value for the shell so that characters such as '&', '?'
    ## or '$' in a URL are passed through literally.
    my $quote = sub {
        my ( $text ) = @_;
        $text = '' unless ( defined $text );
        $text =~ s/'/'\\''/g;
        return "'" . $text . "'";
    };

    ##
    ## Save the file to the appropriate directory
    ##
    ## NOTE(review): the target is always a .jpg below the relative
    ## 'images' directory, regardless of the configured extension and
    ## image directory -- confirm this is intended.
    ##
    my $cDir   = $date->{'mon2'} . $date->{'year2'};
    my $cDate  = $date->{'day2'};
    my $target = "images/$cDir/$comic-$cDate.jpg";

    my $cmd = "wget -q " . $quote->( $file ) .
              " --referer=" . $quote->( $comics->{$comic}{'url'} ) .
              " --user-agent=" . $quote->( $USER_AGENT ) .
              " -O - | /usr/bin/convert - " . $quote->( "jpeg:$target" );

    return system( $cmd );
}
384:
#######################################################################
## Function    : indexDownload
##
## Description :
##    Fetches the comic's web page, searches it line by line with the
##    configured 'search' regex (its first capture group is the image
##    location) and downloads the image found on the last matching
##    line.  The comic's 'ext' is updated from the image location when
##    it contains gif, jpg or png.
##
## Parameters  :
##    $comics - reference to the comic configuration hash
##    $comic  - key of the comic to download
##    $date   - reference to the date hash
##
## Returns     :
##    0 on success, otherwise an error string.
##
#######################################################################
sub indexDownload {
    my ( $comics, $comic, $date ) = @_;
    my ( @lines, $comicLine );
    my $comicIndex = "indexes/index.$comic";
    my $url        = $comics->{$comic}{'url'};

    ## Quote a value for the shell so that characters such as '&', '?'
    ## or '$' in a URL are passed through literally.
    my $quote = sub {
        my ( $text ) = @_;
        $text = '' unless ( defined $text );
        $text =~ s/'/'\\''/g;
        return "'" . $text . "'";
    };

    my $wget_cmd = "wget -q --referer=" . $quote->( $url ) . " " .
                   "--user-agent=" . $quote->( $USER_AGENT ) . " " .
                   $quote->( $url ) . " -O " . $quote->( $comicIndex );
    system( $wget_cmd );

    my $fh;
    if ( ! open( $fh, '<', $comicIndex ) ) {
        return "ERROR: Can't open index file for " . $comics->{$comic}{'fullName'} .
               " (" . $url . ")";
    }
    while ( my $line = <$fh> ) {
        ## The option is stored as 'remove_newlines' by readComicConfig.
        $line =~ s/\R|\ \ +|\t//g if ( $comics->{$comic}{'remove_newlines'} );
        push @lines, $line;
    }
    close( $fh );

    unlink( $comicIndex );

    ## Scheme and host of the comic's page; prepended to image locations
    ## that are not absolute URLs.
    my $mainURL = $url;
    if ( $url =~ m{^((?:\w+://)?[^/]+)} ) {
        $mainURL = $1;
    }

    ##
    ## Find the comic strip URL based on the specified regex in the search
    ##
    my $search = $comics->{$comic}{'search'};
    foreach my $line ( @lines ) {
        ## No 'last' here: the last matching line wins.
        if ( ( $line =~ m/$search/i ) && ( defined $1 ) ) {
            $comicLine = $1;
            chomp $comicLine;
        }
    }

    ##
    ## Save the file to the appropriate directory
    ##
    my $cDir  = $date->{'mon2'} . $date->{'year2'};
    my $cDate = $date->{'day2'};

    if ( $comicLine ) {
        if ( $comicLine =~ m/(gif|jpg|png)/i ) { $comics->{$comic}{'ext'} = $1; }
        my $comicURL = ( $comicLine =~ m/http/ ) ? $comicLine : $mainURL . $comicLine;

        ## The configured extension may carry trailing whitespace; it
        ## must not end up inside the quoted file name.
        my $ext = $comics->{$comic}{'ext'};
        $ext = '' unless ( defined $ext );
        $ext =~ s/\s+\z//;

        my $cmd = "wget --user-agent=" . $quote->( $USER_AGENT ) .
                  " --referer=" . $quote->( $url ) .
                  " -q " . $quote->( $comicURL ) .
                  " -O " . $quote->( "images/$cDir/$comic-$cDate.$ext" );

        ## Report a failed image download instead of claiming success.
        if ( system( $cmd ) != 0 ) {
            return "ERROR: Could not download comic $comics->{$comic}{'fullName'}";
        }
        return 0;
    }

    ## NOTE(review): nothing in this function writes index.html; this looks
    ## like a leftover from an earlier version -- confirm before removing.
    unlink "index.html";

    return "ERROR: Could not download comic $comics->{$comic}{'fullName'}";
}
442:
#######################################################################
## Function    : parseComic
##
## Description :
##    Expands the placeholders in the comic's 'search' template:
##    __year__, __year2__, __mon__, __mon2__, __day__ and __day2__ are
##    taken from the date hash, __ext__ from the comic's 'ext'.
##
## Returns     :
##    The expanded string with a trailing newline removed.
##
#######################################################################
sub parseComic ($$) {
    my ( $comics, $comic, $date ) = @_;
    my $result = $comics->{$comic}{'search'};

    ## The date placeholders are substituted one after another, in this order.
    foreach my $field ( qw/ year year2 mon mon2 day day2 / ) {
        $result =~ s/__${field}__/$date->{$field}/g;
    }
    $result =~ s/__ext__/$comics->{$comic}{'ext'}/g;

    chomp $result;
    return $result;
}
460:
#######################################################################
## Function    : fetchDates
##
## Description :
##    Builds the date hash for the day being fetched, i.e. the current
##    local time minus $days_ago days (counted as 86400 seconds each).
##
## Returns     :
##    A hash with the keys
##       day, mon, year  - numeric day of month, month (1-12), 4 digit year
##       day2, mon2      - day and month padded to two digits
##       year2           - last two digits of the year
##       dow             - day of week index (0 = Sunday)
##       wday            - day of week name
##
#######################################################################
sub fetchDates () {
    my @weekdays = qw/ Sunday Monday Tuesday Wednesday Thursday Friday Saturday /;
    my $when     = time - ( 86400 * $days_ago );

    my ( $mday, $month, $year, $wday ) = ( localtime( $when ) )[3,4,5,6];
    $year  += 1900;
    $month += 1;

    my %when = (
        'day'   => $mday,
        'mon'   => $month,
        'year'  => $year,
        'dow'   => $wday,
        'year2' => substr( $year, 2, 2 ),
        'day2'  => sprintf( "%02d", $mday ),
        'mon2'  => sprintf( "%02d", $month ),
        'wday'  => $weekdays[$wday],
    );

    return %when;
}
1.8 nick 478:
###############################################################################
##
## &fetchOptions( );
##
## Parses the command line (--days, --help, --man) and returns the
## results as a hash.  Shows the usage text on invalid options or
## --help, and the full manual on --man.
##
###############################################################################
sub fetchOptions {
    my %opts = ( 'days' => undef, 'help' => undef, 'man' => undef );

    my $parsed = GetOptions(
        "days:i" => \$opts{'days'},
        "help|?" => \$opts{'help'},
        "man"    => \$opts{'man'},
    );

    pod2usage( ) unless ( $parsed );
    pod2usage( ) if ( defined $opts{'help'} );
    pod2usage( { -verbose => 2, -input => \*DATA } ) if ( defined $opts{'man'} );

    return %opts;
}
499:
500: __END__
501:
502: =head1 NAME
503:
504: fetch.pl - Fetches comics and places them all locally in a single html file.
505:
506: =head1 SYNOPSIS
507:
508: fetch.pl [options]
509:
510: Options:
511: --days,d Fetch comics from X days ago
512: --help,? Display the basic help menu
513: --man,m Display the detailed man page
514:
515: =head1 DESCRIPTION
516:
517: =head1 HISTORY
518:
519: =head1 AUTHOR
520:
521: Nicholas DeClario <nick@declario.com>
522:
523: =head1 BUGS
524:
525: This is a work in progress. Please report all bugs to the author.
526:
527: =head1 SEE ALSO
528:
529: =head1 COPYRIGHT
530:
531: =cut
532:
533:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>