Annotation of comics/fetch.pl.new, revision 1.25
1.1 nick 1: #!/usr/bin/perl -w
2:
1.15 nick 3: ###############################################################################
1.16 nick 4: # $Log: fetch.pl.new,v $
1.25 ! nick 5: # Revision 1.24 2018/02/06 14:31:06 nick
! 6: # A status report is now generated in JSON that can easily be scanned so that
! 7: # I can be alerted when there are failures that I miss if I don't read the
! 8: # comics that day.
! 9: #
1.24 nick 10: # Revision 1.23 2018/01/26 13:05:27 nick
11: # Added a new config option to remove all newline from the resulting index.html
12: # file. This allows for easier parsing for certain comics. I then updated
13: # the URLs to search for and enabled the newline removal for a handful
14: # of uComics.
15: #
16: # I believe I've also properly fixed the Comic Config version displayed on
17: # the webpage itself.
18: #
1.23 nick 19: # Revision 1.22 2017/12/05 13:37:40 nick
20: # Added the CVS config version to the outpuit.
21: #
1.22 nick 22: # Revision 1.21 2015/10/26 14:25:40 nick
23: # Fixed a bug that was improperly including the day of week string preventing the weekend comics from fetching proproperly.
24: #
1.21 nick 25: # Revision 1.20 2015/10/22 12:58:44 nick
26: # Added the ability for Sunday only comics. Stonesoup is no longer weekdays, this has been added to Sunday only. I also added Foxtrot Classics for weekdays and Foxtrot for Sundays.
27: #
1.20 nick 28: # Revision 1.19 2015/07/13 12:56:58 nick
29: # Added Sally Forth and Pearls Before Swine. Adding Sally Forth required a change in the 'wget' command for fetching the index file to include 'user-agent' and 'referer'.
30: #
1.19 nick 31: # Revision 1.18 2015/05/07 12:31:43 nick
32: # Added favicon
33: #
1.18 nick 34: # Revision 1.17 2015/02/19 14:56:10 nick
35: # Fixed a problem that forced everything to JPG. This would kill GIF animations, but would not display the gifs either because 'convert' appends an index number to the end of the file name for each from of the GIF animation. I fixed this to maintain GIF compatibilty as well as rewritting how the script fetches the size of the file. Additionally, I updated the configuration for Questionable Content to search for GIF or JPG, which is what triggered this entire update.
36: #
1.17 nick 37: # Revision 1.16 2015/02/05 18:05:58 nick
38: # Changed the background and added a fancy title.
39: #
1.16 nick 40: # Revision 1.15 2015/01/19 13:46:19 nick
41: # *** empty log message ***
42: #
1.15 nick 43: ###############################################################################
44:
1.1 nick 45: use strict;
46: use File::Path;
47: use Data::Dumper;
1.8 nick 48: use Pod::Usage;
49: use Getopt::Long;
1.24 nick 50: use JSON::Create 'create_json';
1.21 nick 51: use Date::Calc qw/Date_to_Text_Long Today Day_of_Week Day_of_Week_to_Text/;
1.16 nick 52:
##
## Some default values
##
## These file-scoped lexicals are read by the subroutines defined further
## down, so their names and declaration order must be preserved:
## $comicConfigVer is assigned inside readComicConfig, and $days_ago must
## be set before fetchDates is called.
##
my $ver            = '$Id: fetch.pl.new,v 1.24 2018/02/06 14:31:06 nick Exp $';
my $comicFile      = "comics.conf";
my $comicConfigVer = "Unknown";
my $reportFile     = "/home/httpd/html/daily/comics/status_report.json";
my %comics         = &readComicConfig ( $comicFile );
my %opts           = &fetchOptions( );
my $days_ago       = $opts{'days'} || 0;
my %dates          = &fetchDates();
my $baseDir        = $comics{'configs'}{'base_directory'} || ".";
my $imageDir       = $baseDir . "/" . ( $comics{'configs'}{'image_directory'} || "images" ) .
                     "/$dates{'mon2'}$dates{'year2'}";
my $indexDir       = $baseDir . "/" . ( $comics{'configs'}{'index_directory'} || "indexes" );
my $USER_AGENT     = "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.18) Gecko/20110628 Ubuntu/10.10 (maverick) Firefox/3.6.18";
my @days           = qw/ Sunday Monday Tuesday Wednesday Thursday Friday Saturday /;

my $DATE=`date`; chomp $DATE;
print STDOUT "Starting comic fetch at $DATE\n";

##
## Main program starts here
##
&checkDir ( [ $imageDir, $indexDir ] );

&writeTitle ( \%dates );

foreach my $comic ( sort keys %comics ) {

    ## The 'configs' entry holds global settings, it is not a comic.
    next if ( $comic =~ m/config/ );

    ## Skip if this is Sunday and the comic is weekdays only
    if ( ( $dates{'wday'} eq "Sunday" ) &&
         ( $comics{$comic}{'sunday'} == 0 ) ) {
        print "Skipping '$comic'; Weekdays only.\n";
        next;
    }

    ## Skip if Sunday only comic and it's not Sunday.
    if ( ( $dates{'wday'} ne "Sunday" ) &&
         ( $comics{$comic}{'sunday_only'} == 1 ) ) {
        print "Skipping '$comic' ($comics{$comic}{'sunday_only'}); Sunday only.\n";
        next;
    }

    $comics{$comic}{'error'} = &downloadComic ( \%comics, $comic, \%dates );
    &writeComic ( \%comics, $comic, \%dates );

    ## A failed download leaves no usable image behind, so there is
    ## nothing to measure or resize.
    next if ( $comics{$comic}{'error'} );

    my $file = "$imageDir/$comic-$dates{'day2'}.$comics{$comic}{'ext'}";
    my $size = 0;

    ## List-form pipe open: the file name is handed to identify as a single
    ## argument and never passes through a shell.
    open( my $img, '-|', '/usr/bin/identify', '-verbose', $file )
        or die ("Can't open: $!\n");
    while ( my $line = <$img> ) {
        ## Only the first reported geometry (the image width) is kept.
        if ( ( $size == 0 ) && ( $line =~ m/^\s+geometry:\s+(\d+)x\d+.*/i ) ) {
            $size = $1;
        }
    }
    close( $img );

    ## Shrink anything wider than 640 pixels in place.
    system( '/usr/bin/convert', '-resize', '640', $file, $file )
        if ( $size > 640 );
}

## &writeMainIndex ( \%dates );

&writeFooter( \%dates );

print STDOUT "Status written to $reportFile.\n"
    if (&writeStatusReportJSON(\%comics, $reportFile));

$DATE=`date`; chomp( $DATE );
print STDOUT "Completed comic fetch at $DATE\n";

## End
129:
#######################################################################
## Function : downloadComic
##
## Description :
## This function determines the download method being used to
## retrieve the comic and calls the appropriate function.
##
## Arguments :
##   $comics - hash ref of all comic definitions
##   $comic  - key of the comic to download
##   $date   - hash ref of date fields, passed through to the downloader
##
## Returns :
##   Whatever the selected downloader returns (a false value on
##   success). If the mode is invalid an "ERROR: ..." string is
##   returned.
##
#######################################################################
sub downloadComic {
    my ( $comics, $comic, $date ) = @_;

    ## An absent mode is treated as unknown rather than raising a warning.
    my $mode = $comics->{$comic}{'mode'};
    $mode = '' unless defined $mode;

    ## Dispatch on the structure that was passed in, not on the file-level
    ## %comics hash, so the caller decides which data is used.
    return indexDownload ( $comics, $comic, $date )  if ( $mode eq '1' );
    return directDownload ( $comics, $comic, $date ) if ( $mode eq '2' );

    return "ERROR: Unknown download method specified for $comics->{$comic}{'fullName'}.";
}
156:
#######################################################################
## Function : readComicConfig
##
## Description :
## Reads the comic configuration file. Non-comment lines containing at
## least two commas are comic definitions of the form
##   key,url,search,mode,fullName,ext[,sunday[,sunday_only[,remove_newlines]]]
## Other lines of the form "name = value" are stored under the
## 'configs' key. The tokens __YEAR__, __MON__ and __DAY__ in a comic
## definition are replaced with the current local date.
##
## A comment line carrying a CVS $Id keyword sets the file-level
## $comicConfigVer variable as a side effect.
##
## Arguments :
##   $comicFile - path of the configuration file
##
## Returns :
##   A hash keyed by comic key (plus 'configs'). The hash is empty if
##   the file cannot be opened; a warning is printed in that case.
##
#######################################################################
sub readComicConfig {
    my ( $comicFile ) = @_;
    my %comicConfig = ( );

    my ($year, $mon, $day) = ( localtime(time) )[5,4,3];
    $year += 1900;
    $mon = sprintf("%02d", ($mon + 1));
    $day = sprintf("%02d", $day);

    ## Optional numeric flag: fall back to the default only when the field
    ## is missing or blank, so that an explicit 0 is honoured.
    my $flag = sub {
        my ( $value, $default ) = @_;
        return $default unless ( defined $value && $value =~ m/\S/ );
        return sprintf("%d", $value);
    };

    my $fh;
    if ( ! open( $fh, '<', $comicFile ) ) {
        warn "WARNING: Can't read comic config '$comicFile': $!\n";
        return %comicConfig;
    }

    while ( my $line = <$fh> ) {
        ## Drop the line ending so the last field on a line is clean.
        $line =~ s/\r?\n\z//;

        if ( $line =~ m/^#.* \$[Ii][dD]: .*,v\ (.*)\ \d{4}\/.*\$$/ ) {
            $comicConfigVer = $1;
        }

        if ( ( $line !~ m/^#/ ) && ( $line =~ m/,.*,/ ) ) {
            $line =~ s/__YEAR__/$year/g;
            $line =~ s/__MON__/$mon/g;
            $line =~ s/__DAY__/$day/g;

            my @res = split /,/, $line;
            $comicConfig{$res[0]}{'url'}             = $res[1];
            $comicConfig{$res[0]}{'search'}          = $res[2];
            $comicConfig{$res[0]}{'mode'}            = $res[3];
            $comicConfig{$res[0]}{'fullName'}        = $res[4];
            $comicConfig{$res[0]}{'ext'}             = $res[5];
            $comicConfig{$res[0]}{'sunday'}          = $flag->( $res[6], 1 );
            $comicConfig{$res[0]}{'sunday_only'}     = $flag->( $res[7], 0 );
            $comicConfig{$res[0]}{'remove_newlines'} = $flag->( $res[8], 0 );
            $comicConfig{$res[0]}{'error'}           = 0;
        }
        elsif ( $line =~ m/(.*)\s+=\s+(.*)/ ) {
            $comicConfig{'configs'}{$1} = $2;
        }
    }
    close ( $fh );

    return %comicConfig;
}
199:
#######################################################################
## Function : writeStatusReportJSON
##
## Description :
## Writes a JSON status report with one entry per comic (any entry
## that has a 'fullName'), the report date as YYYYMMDD and the total
## number of errors.
##
## Arguments :
##   $comicsRef - hash ref of all comic definitions, including the
##                'error' value recorded for each comic
##   $filename  - path of the report file; it is overwritten
##
## Returns :
##   1 once the report has been written, 0 if closing the file failed.
##   Dies if the report file cannot be created.
##
#######################################################################
sub writeStatusReportJSON ($$) {
    my ( $comicsRef, $filename ) = @_;

    ## Take a single localtime sample so the three date parts always
    ## belong to the same day.
    my ( $mday, $month, $year ) = ( localtime )[3,4,5];
    my $shortDate = sprintf("%d%02d%02d", $year + 1900, $month + 1, $mday);

    my %json = ('date' => $shortDate, 'comics' => []);
    my $totalErrors = 0;

    foreach my $comic (sort keys %{ $comicsRef }) {
        my $entry = $comicsRef->{$comic};

        ## Entries without a full name (e.g. 'configs') are not comics.
        next unless $entry->{'fullName'};

        if ($entry->{'error'}) {
            my %error = ('comicName' => "$entry->{'fullName'}",
                         'error'     => "$entry->{'error'}",
                         'status'    => "Error");
            ## Explicit dereference: pushing onto a bare scalar is a
            ## compile error on Perl 5.24 and later.
            push @{ $json{'comics'} }, \%error;
            $totalErrors += 1;
        } else {
            my %status = ('comicName' => "$entry->{'fullName'}",
                          'error'     => 0,
                          'status'    => "Successfull");
            push @{ $json{'comics'} }, \%status;
        }
    }
    $json{'totalErrors'} = $totalErrors;

    open( my $sr, '>', $filename )
        or die ("ERROR: Failed to create status report: $!\n");
    print {$sr} create_json (\%json);

    ## Buffered write errors only surface when the handle is closed.
    if ( ! close( $sr ) ) {
        warn "WARNING: Failed to finish writing status report: $!\n";
        return 0;
    }

    return 1;
}
232:
#######################################################################
## Function : writeComic
##
## Description :
## Appends one table row for the comic to the day's index page: the
## comic image when the download succeeded, or the error text when
## $comics->{$comic}{'error'} is set.
##
## Arguments :
##   $comics - hash ref of all comic definitions
##   $comic  - key of the comic to write
##   $date   - hash ref of date fields (dow, year2, mon2, day2)
##
## Returns :
##   0 after writing the row, 1 if the index file could not be opened.
##
#######################################################################
sub writeComic {
    my ( $comics, $comic, $date ) = @_;
    my $sd = substr( $days[$date->{'dow'}], 0, 3 );
    my $indexFile = $indexDir . "/index-" . $date->{'year2'} .
                    $date->{'mon2'} . $date->{'day2'} . "-" .
                    $sd . ".html";
    my $content;

    if ( ! $comics->{$comic}{'error'} ) {
        $content = <<EOF;

<!-- ********* Begin $comic ($comics->{$comic}{'fullName'}) ******* -->
<tr>
<td align="left">
<font color="blue"><b>$comics->{$comic}{'fullName'}</b></font>
<font size="-2">
<a href="$comics->{$comic}{'url'}">
$comics->{$comic}{'url'}
</a>
</font><br/>
<img src="../images/$date->{'mon2'}$date->{'year2'}/$comic-$date->{'day2'}.$comics->{$comic}{'ext'}" alt="$comic-$date->{'day2'}" />
<br/><br/>
</td></tr>
<!-- ********* Finish $comic ($comics->{$comic}{'fullName'}) ******* -->

EOF
    }
    else {
        ## The error row opens its own <tr><td> so that it is a complete
        ## table row, matching the success row above.
        $content = <<EOF;
<tr>
<td align="left">
<font color="blue"><b>$comics->{$comic}{'fullName'}</b></font>
<font size="-2">
<a href="$comics->{$comic}{'url'}">
$comics->{$comic}{'url'}
</a>
</font><br/>
<font color="red"><b>$comic : $comics->{$comic}{'error'}</b></font><br/>
</td>
</tr>
EOF
    }

    my $index;
    if ( ! open( $index, '>>', $indexFile ) ) {
        warn "WARNING: Can't append to index file '$indexFile': $!\n";
        return 1;
    }
    print {$index} $content;
    close ( $index );

    return 0;
}
279:
280:
#######################################################################
## Function : writeMainIndex
##
## Description :
## Placeholder; no main index is generated yet. The only call in this
## file is commented out.
##
## Arguments :
##   $date - hash ref of date fields (currently unused)
##
#######################################################################
sub writeMainIndex {
    my ( $date ) = @_;

}
287:
288:
#######################################################################
## Function : writeFooter
##
## Description :
## Appends the closing HTML (generation time, script and config
## versions, CVS link, validator badge) to the day's index page.
##
## Arguments :
##   $date - hash ref of date fields (dow, year2, mon2, day2) used to
##           build the index file name
##
## Returns :
##   Nothing useful. A warning is printed and nothing is written if the
##   index file cannot be opened.
##
#######################################################################
sub writeFooter {
    my ( $date ) = @_;
    my $sd = substr( $days[$date->{'dow'}], 0, 3 );
    my $indexFile = $indexDir . "/index-" . $date->{'year2'} .
                    $date->{'mon2'} . $date->{'day2'} . "-" .
                    $sd . ".html";
    my $sysDate = `date`;

    my $index;
    if ( ! open( $index, '>>', $indexFile ) ) {
        warn "WARNING: Can't append to index file '$indexFile': $!\n";
        return;
    }
    print {$index} <<EOF;
</table>
<center>
<font size="2">
Generated on: <font color="green">$sysDate</font><br/>
Version: <font color="green">$ver</font><br />
Config Version: <font color="green">$comicConfigVer</font><br />
CVS: <a href="http://demandred.dyndns.org/cgi-bin/cvsweb/comics/">http://demandred.dyndns.org/cgi-bin/cvsweb/comics/</a>
<p>
<a href="http://validator.w3.org/check?uri=referer"><img
src="http://www.w3.org/Icons/valid-xhtml10-blue" alt="Valid XHTML 1.0 Transitional" height="31" width="88" border="0" /></a>
</p>
</center>

</body>
</html>
EOF
    close( $index );
}
319:
#######################################################################
## Function : checkDir
##
## Description :
## Makes sure every given directory exists, creating missing ones
## (including parent directories) with mkpath.
##
## Arguments :
##   A list of directory paths and/or array refs of directory paths.
##   Array refs are flattened so that each path is tested individually.
##
#######################################################################
sub checkDir {
    my @dirs = map { ( ref $_ eq 'ARRAY' ) ? @{ $_ } : $_ } @_;

    foreach my $dir ( @dirs ) {
        next unless ( defined $dir && length $dir );
        if ( ! -d $dir ) { mkpath( $dir ); }
    }

    return;
}
329:
#######################################################################
## Function : writeTitle
##
## Description :
## Creates (truncating any existing file) the day's index page and
## writes the HTML head, the page heading and the opening of the main
## table. Comic rows and the footer are appended to it later.
##
## Arguments :
##   $date - hash ref of date fields (dow, year, mon, day, year2, mon2,
##           day2)
##
## Returns :
##   Nothing useful. A warning is printed and nothing is written if the
##   index file cannot be created.
##
#######################################################################
sub writeTitle {
    my ( $date ) = @_;
    my $sd = substr( $days[$date->{'dow'}], 0, 3 );
    my $indexFile = $indexDir . "/index-" . $date->{'year2'} .
                    $date->{'mon2'} . $date->{'day2'} . "-" .
                    $sd . ".html";
    my $today = $days[$date->{'dow'}] . " " . $date->{'mon'} . "/" . $date->{'day'} . "/" . $date->{'year'};

    ## Use the date being fetched rather than the current date, so the
    ## heading agrees with the title and file name when --days is given.
    my $today_long = Date_to_Text_Long( $date->{'year'}, $date->{'mon'}, $date->{'day'} );

    my $index;
    if ( ! open( $index, '>', $indexFile ) ) {
        warn "WARNING: Can't create index file '$indexFile': $!\n";
        return;
    }
    print {$index} <<EOF;
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<link href="/daily/comics/comics.css" type="text/css" rel="stylesheet" media="screen" />
<link rel="shortcut icon" href="./favicon.ico" />
<title>Daily Comics for $today</title>
</head>
<body bgcolor="#FFFFFF">
<table align="center" cellpadding="5" cellspacing="0">
<tr><td>
<table cellpadding="0" cellspacing="0" border="0">
<tr><td align="left"><img src="images/daily_comics_heading01.png" alt="Daily Comics" /></td></tr>
<tr><td align="left">$today_long</td></tr>
<tr><td> </td></tr>
</table>
</td></tr>

EOF
    close ( $index );
}
364:
#######################################################################
## Function : directDownload
##
## Description :
## Builds the image URL from the comic's 'search' template (see
## parseComic), downloads it with wget and pipes it through convert so
## that it is stored as a JPEG in the image directory.
##
## Arguments :
##   $comics - hash ref of all comic definitions
##   $comic  - key of the comic to download
##   $date   - hash ref of date fields (day2 and the template fields)
##
## Returns :
##   The status returned by system() for the download pipeline
##   (0 on success).
##
#######################################################################
sub directDownload {
    my ( $comics, $comic, $date ) = @_;
    my $file = &parseComic ( $comics, $comic, $date );

    ## Single-quote a value for the shell so that characters such as
    ## '&' and '?' in a URL are not interpreted.
    my $quote = sub {
        my ( $text ) = @_;
        $text = '' unless defined $text;
        $text =~ s/'/'\\''/g;
        return "'" . $text . "'";
    };

    ##
    ## Save the file to the appropriate directory
    ##
    ## $imageDir is the directory the main program creates and later
    ## reads the image from.
    my $target = "$imageDir/$comic-$date->{'day2'}.jpg";

    my $cmd = "wget -q " . $quote->( $file ) .
              " --referer=" . $quote->( $comics->{$comic}{'url'} ) .
              " --user-agent=" . $quote->( $USER_AGENT ) .
              " -O - | /usr/bin/convert - " . $quote->( "jpeg:$target" );

    my $status = system($cmd);

    ## The image is always stored as a JPEG, so record that extension for
    ## the code that later references the file.
    $comics->{$comic}{'ext'} = 'jpg';

    return $status;
}
381:
#######################################################################
## Function : indexDownload
##
## Description :
## Downloads the comic's web page, looks for the image URL using the
## comic's 'search' regex (its first capture group is the URL) and
## downloads that image into the image directory. If several lines
## match, the last match is used. When the matched text contains gif,
## jpg or png, the comic's 'ext' is updated to that value.
##
## Arguments :
##   $comics - hash ref of all comic definitions
##   $comic  - key of the comic to download
##   $date   - hash ref of date fields (day2)
##
## Returns :
##   0 on success, otherwise an "ERROR: ..." string.
##
#######################################################################
sub indexDownload {
    my ( $comics, $comic, $date ) = @_;
    my ( @lines, $comicLine, $mainURL );
    my $comicIndex = "$indexDir/index.$comic";
    my $pageURL    = $comics->{$comic}{'url'};

    ## List-form system: the URLs are passed to wget as single arguments
    ## and never go through a shell.
    system( 'wget', '-q', "--referer=$pageURL", "--user-agent=$USER_AGENT",
            $pageURL, '-O', $comicIndex );

    my $fh;
    if ( ! open( $fh, '<', $comicIndex ) ) {
        return "ERROR: Can't open index file for " . $comics->{$comic}{'fullName'} .
               " (" . $comics->{$comic}{'url'} . ")";
    }
    while ( my $line = <$fh> ) {
        ## NOTE(review): this strips line endings, runs of spaces and tabs
        ## from each line, but lines are still searched one at a time;
        ## confirm that is what the configured search patterns expect.
        $line =~ s/\R|\ \ +|\t//g if ( $comics->{$comic}{'remove_newlines'} );
        push @lines, $line;
    }
    close ( $fh );

    unlink ( $comicIndex );

    $mainURL = $comics->{$comic}{'url'};
    ## I need to figure out how to merge these two in to one regex.
    $mainURL =~ s/(http:\/\/.*)(?:\/.*\/){1,}.*/$1/;
    $mainURL =~ s/([a-z])\/.*/$1/i;

    ##
    ## Find the comic strip URL based on the specified regex in the search
    ##
    foreach my $line (@lines) {
        if ( ( $line =~ m/$comics->{$comic}{'search'}/i ) && ( defined $1 ) ) {
            $comicLine = $1; chomp $comicLine;
        }
    }

    ##
    ## Save the file to the appropriate directory
    ##
    if ( $comicLine ) {
        if ( $comicLine =~ m/(gif|jpg|png)/i ) { $comics->{$comic}{'ext'} = $1; }

        ## Anything that does not contain "http" is treated as a path on
        ## the comic's own site.
        my $comicURL = ( $comicLine =~ m/http/ ) ? $comicLine : $mainURL . $comicLine;
        my $target   = "$imageDir/$comic-$date->{'day2'}.$comics->{$comic}{'ext'}";

        my $status = system( 'wget', "--user-agent=$USER_AGENT", "--referer=$pageURL",
                             '-q', $comicURL, '-O', $target );

        ## Report a failed image download instead of claiming success.
        return 0 if ( $status == 0 );
        return "ERROR: Could not download comic $comics->{$comic}{'fullName'}";
    }

    unlink "index.html";

    return "ERROR: Could not download comic $comics->{$comic}{'fullName'}";
}
439:
#######################################################################
## Function : parseComic
##
## Description :
## Expands the placeholders in the comic's 'search' template:
##   __year__  __year2__  __mon__  __mon2__  __day__  __day2__
## are replaced with the matching field of $date, and __ext__ with the
## comic's 'ext' value. A trailing newline is removed.
##
## Arguments :
##   $comics - hash ref of all comic definitions
##   $comic  - key of the comic
##   $date   - hash ref of date fields
##
## Returns :
##   The expanded string. A missing template yields an empty string and
##   a missing replacement value expands to nothing.
##
#######################################################################
sub parseComic {
    my ( $comics, $comic, $date ) = @_;
    my $string = $comics->{$comic}{'search'};
    $string = '' unless defined $string;

    ## Same substitution order as the placeholders are listed above.
    foreach my $field ( qw/ year year2 mon mon2 day day2 / ) {
        my $value = defined $date->{$field} ? $date->{$field} : '';
        $string =~ s/__${field}__/$value/g;
    }

    my $ext = defined $comics->{$comic}{'ext'} ? $comics->{$comic}{'ext'} : '';
    $string =~ s/__ext__/$ext/g;
    chomp $string;

    return $string;
}
457:
#######################################################################
## Function : fetchDates
##
## Description :
## Builds the date fields for the day being fetched, which is the
## current local time minus $days_ago days (86400 seconds each).
##
## Returns :
##   A hash with the keys
##     day, mon, year - numeric day of month, month (1-12), 4-digit year
##     day2, mon2     - day and month padded to two digits
##     year2          - last two digits of the year
##     dow            - day of week index (0 = Sunday)
##     wday           - day of week name
##
#######################################################################
sub fetchDates () {
    my @weekdayNames = qw/ Sunday Monday Tuesday Wednesday Thursday Friday Saturday /;

    my $when = time - ( 86400 * $days_ago );
    my ( $mday, $month, $year, $wday ) = ( localtime( $when ) )[3,4,5,6];

    $year  += 1900;
    $month += 1;

    my %dates = (
        'day'   => $mday,
        'mon'   => $month,
        'year'  => $year,
        'dow'   => $wday,
        'year2' => substr( $year, 2, 2 ),
        'day2'  => sprintf( "%02d", $mday ),
        'mon2'  => sprintf( "%02d", $month ),
        'wday'  => $weekdayNames[$wday],
    );

    return %dates;
}
1.8 nick 475:
###############################################################################
##
## &fetchOptions( );
##
## Parse the command line (--days, --help, --man) and hand the values back
## as a hash keyed by option name. Prints the usage text when parsing fails
## or --help is given, and the full manual when --man is given.
##
###############################################################################
sub fetchOptions {
    my ( $days, $help, $man );

    my $parsed = GetOptions(
        "days:i" => \$days,
        "help|?" => \$help,
        "man"    => \$man,
    );

    pod2usage( ) unless $parsed;
    pod2usage( ) if defined $help;
    pod2usage( { -verbose => 2, -input => \*DATA } ) if defined $man;

    return ( 'days' => $days, 'help' => $help, 'man' => $man );
}
496:
497: __END__
498:
499: =head1 NAME
500:
501: fetch.pl - Fetches comics and places them all locally in a single html file.
502:
503: =head1 SYNOPSIS
504:
505: fetch.pl [options]
506:
507: Options:
508: --days,d Fetch comics from X days ago
509: --help,? Display the basic help menu
510: --man,m Display the detailed man page
511:
512: =head1 DESCRIPTION
513:
514: =head1 HISTORY
515:
516: =head1 AUTHOR
517:
518: Nicholas DeClario <nick@declario.com>
519:
520: =head1 BUGS
521:
522: This is a work in progress. Please report all bugs to the author.
523:
524: =head1 SEE ALSO
525:
526: =head1 COPYRIGHT
527:
528: =cut
529:
530:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>