Annotation of comics/fetch.pl.new, revision 1.28
1.1 nick 1: #!/usr/bin/perl -w
2:
1.15 nick 3: ###############################################################################
1.16 nick 4: # $Log: fetch.pl.new,v $
1.28 ! nick 5: # Revision 1.27 2019/04/15 12:50:23 nick
! 6: # The script was unable to handle the HTML entity '&amp;' and convert it, so I added that. I probably should see if there's a library or something that handles all those automagically but I just tossed a regex in there for now that does the trick.
! 7: #
1.27 nick 8: # Revision 1.26 2018/04/22 14:03:54 nick
9: # Changed the default for Sunday comics that was causing issues with some comics.
10: #
1.26 nick 11: # Revision 1.25 2018/02/12 13:30:58 nick
12: # Added an easier to compare date string to determine if the status json file was updated today and report if it wasn't.
13: #
1.25 nick 14: # Revision 1.24 2018/02/06 14:31:06 nick
15: # A status report is now generated in JSON that can easily be scanned so that
16: # I can be alerted when there are failures that I miss if I don't read the
17: # comics that day.
18: #
1.24 nick 19: # Revision 1.23 2018/01/26 13:05:27 nick
20: # Added a new config option to remove all newline from the resulting index.html
21: # file. This allows for easier parsing for certain comics. I then updated
22: # the URLs to search for and enabled the newline removal for a handful
23: # of uComics.
24: #
25: # I believe I've also properly fixed the Comic Config version displayed on
26: # the webpage itself.
27: #
1.23 nick 28: # Revision 1.22 2017/12/05 13:37:40 nick
29: # Added the CVS config version to the output.
30: #
1.22 nick 31: # Revision 1.21 2015/10/26 14:25:40 nick
32: # Fixed a bug that was improperly including the day of week string, preventing the weekend comics from fetching properly.
33: #
1.21 nick 34: # Revision 1.20 2015/10/22 12:58:44 nick
35: # Added the ability for Sunday only comics. Stonesoup is no longer weekdays, this has been added to Sunday only. I also added Foxtrot Classics for weekdays and Foxtrot for Sundays.
36: #
1.20 nick 37: # Revision 1.19 2015/07/13 12:56:58 nick
38: # Added Sally Forth and Pearls Before Swine. Adding Sally Forth required a change in the 'wget' command for fetching the index file to include 'user-agent' and 'referer'.
39: #
1.19 nick 40: # Revision 1.18 2015/05/07 12:31:43 nick
41: # Added favicon
42: #
1.18 nick 43: # Revision 1.17 2015/02/19 14:56:10 nick
44: # Fixed a problem that forced everything to JPG. This would kill GIF animations, but would not display the GIFs either because 'convert' appends an index number to the end of the file name for each frame of the GIF animation. I fixed this to maintain GIF compatibility as well as rewriting how the script fetches the size of the file. Additionally, I updated the configuration for Questionable Content to search for GIF or JPG, which is what triggered this entire update.
45: #
1.17 nick 46: # Revision 1.16 2015/02/05 18:05:58 nick
47: # Changed the background and added a fancy title.
48: #
1.16 nick 49: # Revision 1.15 2015/01/19 13:46:19 nick
50: # *** empty log message ***
51: #
1.15 nick 52: ###############################################################################
53:
1.1 nick 54: use strict;
55: use File::Path;
56: use Data::Dumper;
1.8 nick 57: use Pod::Usage;
58: use Getopt::Long;
1.24 nick 59: use JSON::Create 'create_json';
1.21 nick 60: use Date::Calc qw/Date_to_Text_Long Today Day_of_Week Day_of_Week_to_Text/;
1.16 nick 61:
##
## Script-wide defaults and shared state.
##
## The order of these declarations matters: the comic config is read
## first, then the command line, then the date table (fetchDates reads
## $days_ago), and finally the directories that depend on both.
##
my $ver            = '$Id: fetch.pl.new,v 1.27 2019/04/15 12:50:23 nick Exp $';
my $comicFile      = "comics.conf";
my $comicConfigVer = "Unknown";    ## replaced by readComicConfig when the config carries an Id tag
my $reportFile     = "/home/httpd/html/daily/comics/status_report.json";

my %comics   = &readComicConfig ( $comicFile );
my %opts     = &fetchOptions( );
my $days_ago = $opts{'days'} || 0;
my %dates    = &fetchDates();

## Images are grouped per month/year below the image directory.
my $baseDir  = $comics{'configs'}{'base_directory'} || ".";
my $imageDir = join( "/",
                     $baseDir,
                     ( $comics{'configs'}{'image_directory'} || "images" ),
                     "$dates{'mon2'}$dates{'year2'}" );
my $indexDir = join( "/",
                     $baseDir,
                     ( $comics{'configs'}{'index_directory'} || "indexes" ) );

my $USER_AGENT = "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.18) Gecko/20110628 Ubuntu/10.10 (maverick) Firefox/3.6.18";
my @days       = qw/ Sunday Monday Tuesday Wednesday Thursday Friday Saturday /;
1.1 nick 79:
my $DATE = `date`;
chomp $DATE;
print STDOUT "Starting comic fetch at $DATE\n";

##
## Main program starts here
##
&checkDir ( [ $imageDir, $indexDir ] );

&writeTitle ( \%dates );

COMIC:
foreach my $comic ( sort keys %comics ) {

    ## The 'configs' entry holds global settings, it is not a comic.
    next COMIC if ( $comic =~ m/config/ );

    ## Skip if this is Sunday and the comic is weekdays only
    if ( ( $dates{'wday'} eq "Sunday" ) &&
         ( $comics{$comic}{'not_sunday'} == 1 ) ) {
        print "Skipping '$comic'; Weekdays only.\n";
        next COMIC;
    }

    ## Skip if Sunday only comic and it's not Sunday.
    if ( ( $dates{'wday'} ne "Sunday" ) &&
         ( $comics{$comic}{'sunday_only'} == 1 ) ) {
        print "Skipping '$comic' ($comics{$comic}{'sunday_only'}); Sunday only.\n";
        next COMIC;
    }

    $comics{$comic}{'error'} = &downloadComic ( \%comics, $comic, \%dates );
    &writeComic ( \%comics, $comic, \%dates );

    ## There is no usable image to measure or resize after a failed download.
    next COMIC if ( $comics{$comic}{'error'} );

    my $file = "$imageDir/$comic-$dates{'day2'}.$comics{$comic}{'ext'}";
    my $size = 0;

    ## List-form pipe open: the file name is handed to identify as a
    ## single argument and never passes through a shell.
    open( my $img, '-|', '/usr/bin/identify', '-verbose', $file )
        or die ("Can't open: $!\n");
    while ( my $line = <$img> ) {
        ## Only the first geometry line counts (first frame of a GIF).
        if ( ( $size == 0 ) && ( $line =~ m/^\s+geometry:\s+(\d+)x\d+.*/i ) ) {
            $size = $1;
        }
    }
    close( $img );

    ## Shrink anything wider than 640 pixels in place.
    system( '/usr/bin/convert', '-resize', '640', $file, $file )
        if ( $size > 640 );
}

## &writeMainIndex ( \%dates );

&writeFooter( \%dates );

print STDOUT "Status written to $reportFile.\n"
    if ( &writeStatusReportJSON( \%comics, $reportFile ) );

$DATE = `date`;
chomp( $DATE );
print STDOUT "Completed comic fetch at $DATE\n";
136:
137: ## End
138:
139: #######################################################################
140: ## Function : downloadComic
141: ##
142: ## Description :
143: ## This function determines the download method being used to
144: ## retrieve the comic and calls the appropriate function.
145: ##
146: ## If the mode is invalid an error will be returned.
147: ##
148: #######################################################################
## Parameters:
##   $comics - hash reference holding every comic's configuration
##   $comic  - key of the comic to download
##   $date   - hash reference with the date fields of the requested day
##
## Returns whatever the selected download function returns, or an
## "ERROR: ..." string when the comic's mode is neither 1 nor 2.
sub downloadComic {
    my ( $comics, $comic, $date ) = @_;

    my $mode = $comics->{$comic}{'mode'};
    $mode = '' unless defined $mode;

    ## The handlers are called with '&' because they are defined further
    ## down the file; the caller's own hash reference is handed through.
    return &indexDownload ( $comics, $comic, $date )  if ( $mode eq 1 );
    return &directDownload ( $comics, $comic, $date ) if ( $mode eq 2 );

    return "ERROR: Unknown download method specified for $comics->{$comic}{'fullName'}.";
}
165:
166: #######################################################################
167: #######################################################################
## Reads the comic configuration file.
##
## Parameters:
##   $comicFile - path of the configuration file
##
## Returns a hash. Comma separated, non-comment lines become one entry
## per comic (keyed by the first field) with the fields url, search,
## mode, fullName, ext, not_sunday, sunday_only, remove_newlines and
## error. Lines of the form "key = value" are stored below 'configs'.
##
## Side effect: when a comment line carries a CVS Id tag, the revision
## found in it is stored in the file-level $comicConfigVer.
sub readComicConfig {
    my ( $comicFile ) = @_;
    my %comicConfig = ( );

    ## __YEAR__, __MON__ and __DAY__ in comic lines expand to today's date.
    my ( $year, $mon, $day ) = ( localtime(time) )[5,4,3];
    $year += 1900;
    $mon = sprintf("%02d", ($mon + 1));
    $day = sprintf("%02d", $day);

    my $fh;
    if ( ! open( $fh, '<', $comicFile ) ) {
        ## Keep going with an empty configuration, but say why.
        warn "WARNING: Can't read comic config '$comicFile': $!\n";
        return %comicConfig;
    }

    while ( my $line = <$fh> ) {
        ## Without this the last field of a comic line keeps its newline.
        chomp $line;

        if ( $line =~ m/^#.* \$[Ii][dD]: .*,v\ (.*)\ \d{4}\/.*\$$/ ) {
            $comicConfigVer = $1;
        }

        if ( ( $line !~ m/^#/ ) && ( $line =~ m/,.*,/ ) ) {
            $line =~ s/__YEAR__/$year/g;
            $line =~ s/__MON__/$mon/g;
            $line =~ s/__DAY__/$day/g;

            my @res = split /,/, $line;
            $comicConfig{$res[0]} = {
                'url'             => $res[1],
                'search'          => $res[2],
                'mode'            => $res[3],
                'fullName'        => $res[4],
                'ext'             => $res[5],
                'not_sunday'      => sprintf("%d", $res[6] || 0),
                'sunday_only'     => sprintf("%d", $res[7] || 0),
                'remove_newlines' => sprintf("%d", $res[8] || 0),
                'error'           => 0,
            };
        }
        elsif ( $line =~ m/(.*)\s+=\s+(.*)/ ) {
            $comicConfig{'configs'}{$1} = $2;
        }
    }
    close( $fh );

    return %comicConfig;
}
208:
209: #######################################################################
210: #######################################################################
## Writes the per-comic download status as JSON.
##
## Parameters:
##   $comicsRef - hash reference holding every comic's configuration,
##                including the 'error' value set after the download
##   $filename  - path of the report file to (over)write
##
## The report holds 'date' (YYYYMMDD of the run), 'comics' (one record
## per entry that has a fullName) and 'totalErrors'.
##
## Returns the result of closing the report file; dies when the file
## cannot be created.
sub writeStatusReportJSON ($$) {
    my ( $comicsRef, $filename ) = @_;

    ## One localtime call, so the three fields belong to the same instant.
    my ( $mday, $mon, $year ) = ( localtime )[3,4,5];
    my $shortDate = sprintf("%d%02d%02d", $year + 1900, $mon + 1, $mday);

    ## 'comics' starts as an empty array so it is a JSON list even when
    ## no comic was processed.
    my %json = ( 'date' => $shortDate, 'comics' => [], 'totalErrors' => 0 );

    foreach my $comic ( sort keys %{$comicsRef} ) {
        my $entry = $comicsRef->{$comic};

        ## Entries without a full name (such as 'configs') are not comics.
        next unless $entry->{'fullName'};

        if ( $entry->{'error'} ) {
            push @{ $json{'comics'} }, {
                'comicName' => "$entry->{'fullName'}",
                'error'     => "$entry->{'error'}",
                'status'    => "Error",
            };
            $json{'totalErrors'} += 1;
        }
        else {
            push @{ $json{'comics'} }, {
                'comicName' => "$entry->{'fullName'}",
                'error'     => 0,
                'status'    => "Successfull",
            };
        }
    }

    open( my $report, '>', $filename )
        or die ("ERROR: Failed to create status report: $!\n");
    print {$report} create_json ( \%json );

    return close( $report );
}
241:
242: #######################################################################
243: #######################################################################
## Appends one comic's table row to the day's index page.
##
## Parameters:
##   $comics - hash reference holding every comic's configuration
##   $comic  - key of the comic to write
##   $date   - hash reference with the date fields of the requested day
##
## A comic with a true 'error' value gets a row showing the error text
## instead of the image. Always returns 0.
sub writeComic {
    my ( $comics, $comic, $date ) = @_;
    my $sd        = substr( $days[ $date->{'dow'} ], 0, 3 );
    my $indexFile = $indexDir . "/index-" . $date->{'year2'} .
                    $date->{'mon2'} . $date->{'day2'} . "-" .
                    $sd . ".html";

    ## Bare ampersands are not valid in XHTML text or attributes. Existing
    ## entities are left alone so nothing gets escaped twice, and the
    ## configuration itself is not modified.
    my $escapeAmp = sub {
        my ( $text ) = @_;
        $text = '' unless defined $text;
        $text =~ s/&(?!(?:[A-Za-z][A-Za-z0-9]*|#[0-9]+|#[xX][0-9A-Fa-f]+);)/&amp;/g;
        return $text;
    };
    my $fullName = $escapeAmp->( $comics->{$comic}{'fullName'} );
    my $url      = $escapeAmp->( $comics->{$comic}{'url'} );
    my $error    = $comics->{$comic}{'error'};
    my $content;

    if ( ! $error ) {
        $content = <<EOF;

<!-- ********* Begin $comic ($fullName) ******* -->
<tr>
<td align="left">
<font color="blue"><b>$fullName</b></font>
<font size="-2">
<a href="$url">
$url
</a>
</font><br/>
<img src="../images/$date->{'mon2'}$date->{'year2'}/$comic-$date->{'day2'}.$comics->{$comic}{'ext'}" alt="$comic-$date->{'day2'}" />
<br/><br/>
</td></tr>
<!-- ********* Finish $comic ($fullName) ******* -->

EOF
    }
    else {
        ## The row and cell are opened here so the closing tags match.
        $content = <<EOF;
<tr>
<td align="left">
<font color="blue"><b>$fullName</b></font>
<font size="-2">
<a href="$url">
$url
</a>
</font><br/>
<font color="red"><b>$comic : $error</b></font><br/>
</td>
</tr>
EOF
    }

    my $index;
    if ( ! open( $index, '>>', $indexFile ) ) {
        warn "WARNING: Can't append to '$indexFile': $!\n";
        return 0;
    }
    print {$index} $content;
    close( $index );

    return 0;
}
289:
290:
291: #######################################################################
292: #######################################################################
## Placeholder for a main index writer: it unpacks its date argument
## and does nothing else. The only call in this file is commented out.
sub writeMainIndex ($$) {
    my ( $date ) = @_;
}
297:
298:
299: #######################################################################
300: #######################################################################
## Appends the closing part of the day's index page: the end of the
## comic table, generation time, script and config versions, and the
## closing body/html tags.
##
## Parameters:
##   $date - hash reference with the date fields of the requested day
sub writeFooter {
    my ( $date ) = @_;
    my $sd        = substr( $days[ $date->{'dow'} ], 0, 3 );
    my $indexFile = $indexDir . "/index-" . $date->{'year2'} .
                    $date->{'mon2'} . $date->{'day2'} . "-" .
                    $sd . ".html";

    ## Drop the newline that the date command appends.
    my $sysDate = `date`;
    chomp $sysDate;

    my $index;
    if ( ! open( $index, '>>', $indexFile ) ) {
        warn "WARNING: Can't append to '$indexFile': $!\n";
        return;
    }
    print {$index} <<EOF;
</table>
<center>
Generated on: <font size="2" color="green">$sysDate</font><br/>
Version: <font size="2" color="green">$ver</font><br />
Config Version: <font size="2" color="green">$comicConfigVer</font><br />
CVS: <a href="http://demandred.dyndns.org:3000/cgi-bin/cvsweb/comics/">http://demandred.dyndns.org/cgi-bin/cvsweb/comics/</a>
<br />
<a href="http://validator.w3.org/check?uri=referer"><img
src="http://www.w3.org/Icons/valid-xhtml10-blue" alt="Valid XHTML 1.0 Transitional" height="31" width="88" border="0" /></a>
</center>

</body>
</html>
EOF
    close( $index );

    return;
}
327:
328: #######################################################################
329: #######################################################################
## Creates every directory that does not exist yet, including missing
## parent directories.
##
## Parameters: directory paths, given as plain strings and/or as array
## references of strings (the main program passes one array reference).
sub checkDir {
    ## Flatten array references so the -d test sees real paths.
    my @dir = map { ref($_) eq 'ARRAY' ? @{$_} : $_ } @_;

    foreach my $path ( @dir ) {
        next unless ( defined $path && length $path );
        mkpath( $path ) if ( ! -d $path );
    }

    return;
}
337:
338: #######################################################################
339: #######################################################################
## Starts the day's index page: creates (or truncates) the file and
## writes the XHTML head, the page heading and the opening table tag.
##
## Parameters:
##   $date - hash reference with the date fields of the requested day
sub writeTitle {
    my ( $date ) = @_;
    my $sd        = substr( $days[ $date->{'dow'} ], 0, 3 );
    my $indexFile = $indexDir . "/index-" . $date->{'year2'} .
                    $date->{'mon2'} . $date->{'day2'} . "-" .
                    $sd . ".html";
    my $today = $days[$date->{'dow'}] . " " . $date->{'mon'} . "/" .
                $date->{'day'} . "/" . $date->{'year'};

    ## Built from the requested day, so the heading agrees with the
    ## page title when an earlier day is fetched.
    my $today_long = Date_to_Text_Long( $date->{'year'}, $date->{'mon'}, $date->{'day'} );

    my $index;
    if ( ! open( $index, '>', $indexFile ) ) {
        warn "WARNING: Can't create '$indexFile': $!\n";
        return;
    }
    print {$index} <<EOF;
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<link href="/daily/comics/comics.css" type="text/css" rel="stylesheet" media="screen" />
<link rel="shortcut icon" href="./favicon.ico" />
<title>Daily Comics for $today</title>
</head>
<body bgcolor="#FFFFFF">
<table cellpadding="0" cellspacing="0" border="0">
<tr><td align="left"><img src="images/daily_comics_heading01.png" alt="Comic Page Heading" /></td></tr>
<tr><td align="left">$today_long</td></tr>
<tr><td>&nbsp;</td></tr>
EOF
    close( $index );

    return;
}
368:
369: #######################################################################
370: #######################################################################
## Downloads a comic whose image URL is built straight from the
## comic's search template, and stores it as a JPEG.
##
## Parameters:
##   $comics - hash reference holding every comic's configuration
##   $comic  - key of the comic to download
##   $date   - hash reference with the date fields of the requested day
##
## Returns the status of the download pipeline as reported by system
## (0 on success). The comic's 'ext' is set to 'jpg' because that is
## the name the image is stored under.
sub directDownload {
    my ( $comics, $comic, $date ) = @_;
    my $file = &parseComic ( $comics, $comic, $date );

    ##
    ## Save the file to the appropriate directory
    ##
    my $cDir  = $date->{'mon2'} . $date->{'year2'};
    my $cDate = $date->{'day2'};

    ## The pipe to convert needs a shell, so every value is quoted as
    ## one shell word; characters such as '&' or '?' in a URL would
    ## otherwise be interpreted by the shell.
    my $shellQuote = sub {
        my ( $word ) = @_;
        $word = '' unless defined $word;
        $word =~ s/'/'\\''/g;
        return "'$word'";
    };

    my $cmd = join( ' ',
        'wget', '-q', $shellQuote->( $file ),
        '--referer=' . $shellQuote->( $comics->{$comic}{'url'} ),
        '--user-agent=' . $shellQuote->( $USER_AGENT ),
        '-O', '-',
        '|',
        '/usr/bin/convert', '-', $shellQuote->( "jpeg:images/$cDir/$comic-$cDate.jpg" ) );

    my $status = system( $cmd );

    ## Set after the URL was built, which may have used the configured
    ## extension.
    $comics->{$comic}{'ext'} = 'jpg';

    return $status;
}
385:
386: #######################################################################
387: #######################################################################
## Downloads a comic by fetching its web page, locating the image URL
## with the comic's search regex and fetching that image.
##
## Parameters:
##   $comics - hash reference holding every comic's configuration
##   $comic  - key of the comic to download
##   $date   - hash reference with the date fields of the requested day
##
## The search regex must capture the image URL in its first group; the
## last matching line of the page wins. When the URL contains gif, jpg
## or png, the comic's 'ext' is updated accordingly.
##
## Returns 0 on success or an "ERROR: ..." string on failure.
sub indexDownload {
    my ( $comics, $comic, $date ) = @_;
    my ( @lines, $comicLine, $mainURL );
    my $pageURL    = $comics->{$comic}{'url'};
    my $comicIndex = "indexes/index.$comic";

    ## List form: no shell is involved, so the URL needs no quoting.
    system( 'wget', '-q',
            "--referer=$pageURL",
            "--user-agent=$USER_AGENT",
            $pageURL, '-O', $comicIndex );

    my $page;
    if ( ! open( $page, '<', $comicIndex ) ) {
        return "ERROR: Can't open index file for " . $comics->{$comic}{'fullName'} .
               " (" . $comics->{$comic}{'url'} . ")";
    }
    while ( my $line = <$page> ) {
        $line =~ s/\R|\ \ +|\t//g if ( $comics->{$comic}{'remove_newlines'} );
        push @lines, $line;
    }
    close( $page );

    unlink( $comicIndex );

    $mainURL = $pageURL;
    ## I need to figure out how to merge these two in to one regex.
    $mainURL =~ s/(http:\/\/.*)(?:\/.*\/){1,}.*/$1/;
    $mainURL =~ s/([a-z])\/.*/$1/i;

    ##
    ## Find the comic strip URL based on the specified regex in the search
    ##
    foreach my $line ( @lines ) {
        ## A pattern without a capture group matches but leaves $1 undefined.
        if ( ( $line =~ m/$comics->{$comic}{'search'}/i ) && defined $1 ) {
            $comicLine = $1;
            chomp $comicLine;
        }
    }

    ##
    ## Save the file to the appropriate directory
    ##
    my $cDir  = $date->{'mon2'} . $date->{'year2'};
    my $cDate = $date->{'day2'};

    if ( $comicLine ) {
        if ( $comicLine =~ m/(gif|jpg|png)/i ) { $comics->{$comic}{'ext'} = $1; }
        my $comicURL = ( $comicLine =~ m/http/ ) ? $comicLine : $mainURL . $comicLine;

        ## URLs lifted out of HTML carry '&amp;' where the real URL has '&'.
        $comicURL =~ s/&amp;/&/g;

        my $status = system( 'wget',
                             "--user-agent=$USER_AGENT",
                             "--referer=$pageURL",
                             '-q', $comicURL,
                             '-O', "images/$cDir/$comic-$cDate.$comics->{$comic}{'ext'}" );

        ## Report a failed fetch instead of claiming success.
        return "ERROR: Could not download comic $comics->{$comic}{'fullName'}"
            if ( $status != 0 );

        return 0;
    }

    ## NOTE(review): nothing in this function creates index.html; this
    ## looks like a leftover from an older download path - confirm
    ## before removing.
    unlink "index.html";

    return "ERROR: Could not download comic $comics->{$comic}{'fullName'}";
}
447:
448: #######################################################################
449: #######################################################################
## Expands the date and extension placeholders in a comic's search
## template.
##
## Parameters:
##   $comics - hash reference holding every comic's configuration
##   $comic  - key of the comic whose 'search' template is expanded
##   $date   - hash reference with year, year2, mon, mon2, day, day2
##
## Returns the expanded string without a trailing newline. The stored
## template itself is not modified.
sub parseComic {
    my ( $comics, $comic, $date ) = @_;
    my $string = $comics->{$comic}{'search'};

    ## Placeholder name => replacement value, applied in this order.
    my @placeholders = (
        [ 'year',  $date->{'year'} ],
        [ 'year2', $date->{'year2'} ],
        [ 'mon',   $date->{'mon'} ],
        [ 'mon2',  $date->{'mon2'} ],
        [ 'day',   $date->{'day'} ],
        [ 'day2',  $date->{'day2'} ],
        [ 'ext',   $comics->{$comic}{'ext'} ],
    );

    foreach my $pair ( @placeholders ) {
        my ( $name, $value ) = @{$pair};
        $string =~ s/__${name}__/$value/g;
    }
    chomp $string;

    return $string;
}
465:
466: #######################################################################
467: #######################################################################
## Builds the date table for the day being fetched, which lies
## $days_ago days (in steps of 86400 seconds) before now.
##
## Returns a hash with:
##   day, mon, year - numeric day of month, month (1-12), 4-digit year
##   day2, mon2     - day and month padded to two digits
##   year2          - last two digits of the year
##   dow            - day of week index, 0 = Sunday
##   wday           - English day name
sub fetchDates () {
    my @dayNames = qw/ Sunday Monday Tuesday Wednesday Thursday Friday Saturday /;

    my ( $mday, $month, $year, $dow ) =
        ( localtime( time - ( 86400 * $days_ago ) ) )[3,4,5,6];

    ## localtime counts years from 1900 and months from 0.
    $year  += 1900;
    $month += 1;

    my %dates = (
        'day'   => $mday,
        'mon'   => $month,
        'year'  => $year,
        'dow'   => $dow,
        'year2' => substr( $year, 2, 2 ),
        'day2'  => sprintf( "%02d", $mday ),
        'mon2'  => sprintf( "%02d", $month ),
        'wday'  => $dayNames[$dow],
    );

    return %dates;
}
1.8 nick 483:
484: ###############################################################################
485: ##
486: ## &fetchOptions( );
487: ##
488: ## Grab our command line arguments and toss them in to a hash
489: ##
490: ###############################################################################
sub fetchOptions {
    ## One scalar per option; each stays undefined unless it was given.
    my ( $days, $help, $man );

    my $parsedOk = &GetOptions(
        "days:i" => \$days,
        "help|?" => \$help,
        "man"    => \$man,
    );

    ## An unparsable command line shows the usage text.
    &pod2usage( ) if ( ! $parsedOk );

    &pod2usage( ) if defined $help;
    &pod2usage( { -verbose => 2, -input => \*DATA } ) if defined $man;

    return (
        'days' => $days,
        'help' => $help,
        'man'  => $man,
    );
}
504:
505: __END__
506:
507: =head1 NAME
508:
509: fetch.pl - Fetches comics and places them all locally in a single html file.
510:
511: =head1 SYNOPSIS
512:
513: fetch.pl [options]
514:
515: Options:
516: --days,d Fetch comics from X days ago
517: --help,? Display the basic help menu
518: --man,m Display the detailed man page
519:
520: =head1 DESCRIPTION
521:
522: =head1 HISTORY
523:
524: =head1 AUTHOR
525:
526: Nicholas DeClario <nick@declario.com>
527:
528: =head1 BUGS
529:
530: This is a work in progress. Please report all bugs to the author.
531:
532: =head1 SEE ALSO
533:
534: =head1 COPYRIGHT
535:
536: =cut
537:
538:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>