Annotation of comics/fetch.pl.new, revision 1.24
1.1 nick 1: #!/usr/bin/perl -w
2:
1.15 nick 3: ###############################################################################
1.16 nick 4: # $Log: fetch.pl.new,v $
1.24 ! nick 5: # Revision 1.23 2018/01/26 13:05:27 nick
! 6: # Added a new config option to remove all newline from the resulting index.html
! 7: # file. This allows for easier parsing for certain comics. I then updated
! 8: # the URLs to search for and enabled the newline removal for a handful
! 9: # of uComics.
! 10: #
! 11: # I believe I've also properly fixed the Comic Config version displayed on
! 12: # the webpage itself.
! 13: #
1.23 nick 14: # Revision 1.22 2017/12/05 13:37:40 nick
15: # Added the CVS config version to the output.
16: #
1.22 nick 17: # Revision 1.21 2015/10/26 14:25:40 nick
18: # Fixed a bug that was improperly including the day of week string, preventing the weekend comics from fetching properly.
19: #
1.21 nick 20: # Revision 1.20 2015/10/22 12:58:44 nick
21: # Added the ability for Sunday only comics. Stonesoup is no longer weekdays, this has been added to Sunday only. I also added Foxtrot Classics for weekdays and Foxtrot for Sundays.
22: #
1.20 nick 23: # Revision 1.19 2015/07/13 12:56:58 nick
24: # Added Sally Forth and Pearls Before Swine. Adding Sally Forth required a change in the 'wget' command for fetching the index file to include 'user-agent' and 'referer'.
25: #
1.19 nick 26: # Revision 1.18 2015/05/07 12:31:43 nick
27: # Added favicon
28: #
1.18 nick 29: # Revision 1.17 2015/02/19 14:56:10 nick
30: # Fixed a problem that forced everything to JPG. This would kill GIF animations, but would not display the GIFs either because 'convert' appends an index number to the end of the file name for each frame of the GIF animation. I fixed this to maintain GIF compatibility as well as rewriting how the script fetches the size of the file. Additionally, I updated the configuration for Questionable Content to search for GIF or JPG, which is what triggered this entire update.
31: #
1.17 nick 32: # Revision 1.16 2015/02/05 18:05:58 nick
33: # Changed the background and added a fancy title.
34: #
1.16 nick 35: # Revision 1.15 2015/01/19 13:46:19 nick
36: # *** empty log message ***
37: #
1.15 nick 38: ###############################################################################
39:
1.1 nick 40: use strict;
41: use File::Path;
42: use Data::Dumper;
1.8 nick 43: use Pod::Usage;
44: use Getopt::Long;
1.24 ! nick 45: use JSON::Create 'create_json';
1.21 nick 46: use Date::Calc qw/Date_to_Text_Long Today Day_of_Week Day_of_Week_to_Text/;
1.16 nick 47:
##
## Some default values
##
## NOTE: the order of these declarations matters. $comicConfigVer must be
## declared before readComicConfig() runs (that sub assigns to it), and
## $days_ago must be set before fetchDates() reads it.
##
my $ver            = '$Id: fetch.pl.new,v 1.23 2018/01/26 13:05:27 nick Exp $';
my $comicFile      = "comics.conf";
my $comicConfigVer = "Unknown";
my $reportFile     = "/home/httpd/html/daily/comics/status_report.json";
my %comics         = &readComicConfig ( $comicFile );
my %opts           = &fetchOptions( );
my $days_ago       = $opts{'days'} || 0;
my %dates          = &fetchDates();
my $baseDir        = $comics{'configs'}{'base_directory'} || ".";
my $imageDir       = $baseDir . "/" . ( $comics{'configs'}{'image_directory'} || "images" ) .
                     "/$dates{'mon2'}$dates{'year2'}";
my $indexDir       = $baseDir . "/" . ( $comics{'configs'}{'index_directory'} || "indexes" );
my $USER_AGENT     = "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.18) Gecko/20110628 Ubuntu/10.10 (maverick) Firefox/3.6.18";
my @days           = qw/ Sunday Monday Tuesday Wednesday Thursday Friday Saturday /;

my $DATE=`date`; chomp $DATE;
print STDOUT "Starting comic fetch at $DATE\n";

##
## Main program starts here
##
&checkDir ( [ $imageDir, $indexDir ] );

&writeTitle ( \%dates );

foreach my $comic ( sort keys %comics ) {

    ## The 'configs' entry holds global settings, it is not a comic.
    next if ( $comic =~ m/config/ );

    ## Skip if this is Sunday and the comic is weekdays only
    if (($dates{'wday'} eq "Sunday") &&
        ($comics{$comic}{'sunday'} == 0)) {
        print "Skipping '$comic'; Weekdays only.\n";
        next;
    }

    ## Skip if Sunday only comic and it's not Sunday.
    if (($dates{'wday'} ne "Sunday") &&
        ($comics{$comic}{'sunday_only'} == 1)) {
        print "Skipping '$comic' ($comics{$comic}{'sunday_only'}); Sunday only.\n";
        next;
    }

    $comics{$comic}{'error'} = &downloadComic ( \%comics, $comic, \%dates );
    &writeComic ( \%comics, $comic, \%dates );

    ## Nothing to measure or resize when the download failed or left
    ## behind a missing/empty file.
    next if ( $comics{$comic}{'error'} );

    my $file = "$imageDir/$comic-$dates{'day2'}.$comics{$comic}{'ext'}";
    next if ( ! -s $file );

    ##
    ## Ask ImageMagick for the image width.  Only the first "geometry"
    ## line is used; an animated GIF reports one per frame.  The list
    ## form of open keeps the file name away from the shell.
    ##
    my $size = 0;
    open( my $img, '-|', '/usr/bin/identify', '-verbose', $file )
        || die ("Can't open: $!\n");
    while ( my $line = <$img> ) {
        if ( $line =~ m/^\s+geometry:\s+(\d+)x\d+.*/i ) {
            $size = $1 if ( $size == 0 );
        }
    }
    close( $img );

    ## Shrink anything wider than 640 pixels, in place.
    system( '/usr/bin/convert', '-resize', '640', $file, $file )
        if ( $size > 640 );
}

## &writeMainIndex ( \%dates );

&writeFooter( \%dates );

print STDOUT "Status written to $reportFile.\n"
    if (&writeStatusReportJSON(\%comics, $reportFile));

$DATE=`date`; chomp( $DATE );
print STDOUT "Completed comic fetch at $DATE\n";

## End
124:
#######################################################################
## Function    : downloadComic
##
## Description :
##   This function determines the download method being used to
##   retrieve the comic and calls the appropriate function.
##
##   If the mode is invalid an error will be returned.
##
## Arguments   :
##   $comics - reference to the comic configuration hash
##   $comic  - key of the comic to download
##   $date   - reference to the dates hash
##
## Returns     :
##   Whatever the selected download function returns, or an
##   "ERROR: ..." string when the mode is neither 1 nor 2.
##
## The old ($$) prototype was dropped: it did not match the three
## arguments and the only call site bypasses prototypes with '&'.
#######################################################################
sub downloadComic {
    my ( $comics, $comic, $date ) = @_;

    ## Mode 1 scrapes an index page, mode 2 builds the URL directly.
    my %handlerFor = (
        '1' => \&indexDownload,
        '2' => \&directDownload,
    );

    my $mode    = $comics->{$comic}{'mode'};
    my $handler = defined $mode ? $handlerFor{$mode} : undef;

    ## Hand the caller's own reference on rather than the file-global hash.
    return $handler->( $comics, $comic, $date ) if ( $handler );

    return "ERROR: Unknown download method specified for $comics->{$comic}{'fullName'}.";
}
151:
152: #######################################################################
153: #######################################################################
##
## readComicConfig( $comicFile )
##
## Parses the comic configuration file and returns it as a hash.
##
## Comic lines are comma separated:
##   key,url,search,mode,fullName,ext[,sunday[,sunday_only[,remove_newlines]]]
## and are stored under $hash{key}.  The optional flags default to
## sunday=1, sunday_only=0 and remove_newlines=0.  __YEAR__, __MON__ and
## __DAY__ on a comic line are replaced with the current local date.
##
## Lines of the form "name = value" are stored under $hash{'configs'}.
##
## A comment line holding a CVS Id keyword sets the file-level
## $comicConfigVer to the revision number found in it.
##
## If the file cannot be opened a warning is printed and an empty hash
## is returned.
##
sub readComicConfig {
    my ( $comicFile ) = @_;
    my %comicConfig = ( );

    my ($year, $mon, $day) = ( localtime(time) )[5,4,3];
    $year += 1900;
    $mon = sprintf("%02d", ($mon + 1));
    $day = sprintf("%02d", $day);

    ## Numeric flag with a default.  Only a missing or blank field takes
    ## the default, so an explicit "0" in the file is honoured.
    my $flag = sub {
        my ( $value, $default ) = @_;
        return $default if ( ! defined $value || $value !~ m/\S/ );
        return sprintf("%d", $value);
    };

    my $fh;
    if ( ! open( $fh, '<', $comicFile ) ) {
        warn "WARNING: Can't read comic config '$comicFile': $!\n";
        return %comicConfig;
    }

    while ( my $line = <$fh> ) {
        ## Drop the line ending so the last field does not carry it.
        $line =~ s/\r?\n\z//;

        if ( $line =~ m/^#.* \$[Ii][dD]: .*,v\ (.*)\ \d{4}\/.*\$$/ ) {
            $comicConfigVer = $1;
        }

        if ( ( $line !~ m/^#/ ) && ( $line =~ m/,.*,/ ) ) {
            $line =~ s/__YEAR__/$year/g;
            $line =~ s/__MON__/$mon/g;
            $line =~ s/__DAY__/$day/g;

            my @res  = split /,/, $line;
            my $name = $res[0];
            $comicConfig{$name}{'url'}             = $res[1];
            $comicConfig{$name}{'search'}          = $res[2];
            $comicConfig{$name}{'mode'}            = $res[3];
            $comicConfig{$name}{'fullName'}        = $res[4];
            $comicConfig{$name}{'ext'}             = $res[5];
            $comicConfig{$name}{'sunday'}          = $flag->( $res[6], 1 );
            $comicConfig{$name}{'sunday_only'}     = $flag->( $res[7], 0 );
            $comicConfig{$name}{'remove_newlines'} = $flag->( $res[8], 0 );
            $comicConfig{$name}{'error'}           = 0;
        }
        elsif ( $line =~ m/(.*)\s+=\s+(.*)/ ) {
            $comicConfig{'configs'}{$1} = $2;
        }
    }
    close( $fh );

    return %comicConfig;
}
194:
195: #######################################################################
196: #######################################################################
##
## writeStatusReportJSON( \%comics, $filename )
##
## Writes a JSON status report to $filename.  The document holds the
## file-level $DATE string, one entry per comic (every config entry that
## has a 'fullName') and the total number of comics that ended with an
## error.
##
## Returns 1 when the report was written and closed successfully, 0 when
## the close failed.  Dies if the file cannot be created.
##
## NOTE(review): the "Successfull" status text is kept exactly as it was
## because readers of the report may match on it.
##
sub writeStatusReportJSON {
    my ( $comicsRef, $filename ) = @_;
    my @entries     = ( );
    my $totalErrors = 0;

    foreach my $comic ( sort keys %{$comicsRef} ) {
        my $info = $comicsRef->{$comic};
        next unless $info->{'fullName'};

        if ( $info->{'error'} ) {
            push @entries, { 'comicName' => "$info->{'fullName'}",
                             'error'     => "$info->{'error'}",
                             'status'    => "Error" };
            $totalErrors += 1;
        } else {
            push @entries, { 'comicName' => "$info->{'fullName'}",
                             'error'     => 0,
                             'status'    => "Successfull" };
        }
    }

    my %json = ( 'date'        => $DATE,
                 'comics'      => \@entries,
                 'totalErrors' => $totalErrors );

    open( my $sr, '>', $filename )
        or die ("ERROR: Failed to create status report: $!\n");
    print {$sr} create_json (\%json);

    ## The caller prints its "Status written" message on a true return.
    return close( $sr ) ? 1 : 0;
}
! 224:
! 225: #######################################################################
! 226: #######################################################################
##
## writeComic( \%comics, $comic, \%dates )
##
## Appends one table row for $comic to the day's index page.  When the
## comic has no error the row shows its image; otherwise the row shows
## the error text in red.
##
## Returns 0 after writing, 1 when the index page could not be opened.
##
sub writeComic {
    my ( $comics, $comic, $date ) = @_;
    my $sd        = substr( $days[$date->{'dow'}], 0, 3 );
    my $indexFile = $indexDir . "/index-" . $date->{'year2'} .
                    $date->{'mon2'} . $date->{'day2'} . "-" .
                    $sd . ".html";

    my $index;
    if ( ! open( $index, '>>', $indexFile ) ) {
        warn "WARNING: Can't append to $indexFile: $!\n";
        return 1;
    }

    if ( ! $comics->{$comic}{'error'} ) {
        print {$index} <<EOF;

<!-- ********* Begin $comic ($comics->{$comic}{'fullName'}) ******* -->
<tr>
<td align="left">
<font color="blue"><b>$comics->{$comic}{'fullName'}</b></font>
<font size="-2">
<a href="$comics->{$comic}{'url'}">
$comics->{$comic}{'url'}
</a>
</font><br/>
<img src="../images/$date->{'mon2'}$date->{'year2'}/$comic-$date->{'day2'}.$comics->{$comic}{'ext'}" alt="$comic-$date->{'day2'}" />
<br/><br/>
</td></tr>
<!-- ********* Finish $comic ($comics->{$comic}{'fullName'}) ******* -->

EOF
    }
    else {
        ## The error row opens its own <tr><td> so the closing tags
        ## below have something to close.
        print {$index} <<EOF;
<tr>
<td align="left">
<font color="blue"><b>$comics->{$comic}{'fullName'}</b></font>
<font size="-2">
<a href="$comics->{$comic}{'url'}">
$comics->{$comic}{'url'}
</a>
</font><br/>
<font color="red"><b>$comic : $comics->{$comic}{'error'}</b></font><br/>
</td>
</tr>
EOF
    }

    close( $index );

    return 0;
}
271:
272:
273: #######################################################################
274: #######################################################################
##
## writeMainIndex( \%dates )
##
## Placeholder.  It unpacks the dates reference and writes nothing; the
## only call to it in the main program is commented out.
##
sub writeMainIndex ($$) {
    my ( $date ) = @_;
}
279:
280:
281: #######################################################################
282: #######################################################################
##
## writeFooter( \%dates )
##
## Appends the closing part of the day's index page: the end of the
## outer table, the generation time, the script and config versions,
## the CVS link and the validator badge.
##
## Prints a warning and returns without writing when the index page
## cannot be opened.
##
sub writeFooter {
    my ( $date ) = @_;
    my $sd        = substr( $days[$date->{'dow'}], 0, 3 );
    my $indexFile = $indexDir . "/index-" . $date->{'year2'} .
                    $date->{'mon2'} . $date->{'day2'} . "-" .
                    $sd . ".html";

    ## Drop the newline `date` prints so it does not land inside the tag.
    my $sysDate = `date`;
    chomp( $sysDate );

    my $index;
    if ( ! open( $index, '>>', $indexFile ) ) {
        warn "WARNING: Can't append to $indexFile: $!\n";
        return;
    }

    print {$index} <<EOF;
</table>
<center>
<font size="2">
Generated on: <font color="green">$sysDate</font><br/>
Version: <font color="green">$ver</font><br />
Config Version: <font color="green">$comicConfigVer</font><br />
CVS: <a href="http://demandred.dyndns.org/cgi-bin/cvsweb/comics/">http://demandred.dyndns.org/cgi-bin/cvsweb/comics/</a>
<p>
<a href="http://validator.w3.org/check?uri=referer"><img
src="http://www.w3.org/Icons/valid-xhtml10-blue" alt="Valid XHTML 1.0 Transitional" height="31" width="88" border="0" /></a>
</p>
</center>

</body>
</html>
EOF
    close( $index );
}
311:
312: #######################################################################
313: #######################################################################
##
## checkDir( @dirs )  or  checkDir( [ @dirs ] )
##
## Creates every listed directory (and missing parents) that does not
## exist yet.  Directories may be passed as a plain list, as array
## references, or a mix of both.
##
## Array references are flattened first so the -d test sees real path
## names instead of a stringified reference.
##
sub checkDir {
    my @dirs = map { ref($_) eq 'ARRAY' ? @{$_} : $_ } @_;

    foreach my $dir ( @dirs ) {
        next if ( ! defined $dir || $dir eq '' );
        mkpath( $dir ) if ( ! -d $dir );
    }

    return;
}
321:
322: #######################################################################
323: #######################################################################
##
## writeTitle( \%dates )
##
## Starts (truncates) the day's index page and writes the XHTML head,
## the opening of the outer table and the heading row with the long
## date.
##
## Both dates shown on the page come from the dates hash, so they agree
## with each other when --days is used.
##
## Prints a warning and returns without writing when the index page
## cannot be created.
##
sub writeTitle {
    my ( $date ) = @_;
    my $sd        = substr( $days[$date->{'dow'}], 0, 3 );
    my $indexFile = $indexDir . "/index-" . $date->{'year2'} .
                    $date->{'mon2'} . $date->{'day2'} . "-" .
                    $sd . ".html";
    my $today      = $days[$date->{'dow'}] . " " . $date->{'mon'} . "/" . $date->{'day'} . "/" . $date->{'year'};
    my $today_long = Date_to_Text_Long( $date->{'year'}, $date->{'mon'}, $date->{'day'} );

    my $index;
    if ( ! open( $index, '>', $indexFile ) ) {
        warn "WARNING: Can't create $indexFile: $!\n";
        return;
    }

    ## The heading lives in an inner table; it is closed here, while the
    ## outer table stays open for the comic rows and is closed by the
    ## footer.
    print {$index} <<EOF;
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<link href="/daily/comics/comics.css" type="text/css" rel="stylesheet" media="screen" />
<link rel="shortcut icon" href="./favicon.ico" />
<title>Daily Comics for $today</title>
</head>
<body bgcolor="#FFFFFF">
<table align="center" cellpadding="5" cellspacing="0">
<tr><td>
<table cellpadding="0" cellspacing="0" border="0">
<tr><td align="left"><img src="images/daily_comics_heading01.png" alt="Daily Comics" /></td></tr>
<tr><td align="left">$today_long</td></tr>
<tr><td> </td></tr>
</table>
</td></tr>

EOF
    close( $index );
}
356:
357: #######################################################################
358: #######################################################################
##
## directDownload( \%comics, $comic, \%dates )
##
## Builds the image URL from the comic's search template (parseComic),
## fetches it with wget and pipes it through ImageMagick 'convert',
## which stores it as a JPEG in the image directory.
##
## Returns the value of system() for the pipeline: 0 on success,
## non-zero on failure.
##
## NOTE(review): the file is always saved with a ".jpg" extension while
## the main loop and writeComic() build the name from the comic's 'ext'
## value.  Confirm that mode 2 comics are configured with ext "jpg".
##
sub directDownload {
    my ( $comics, $comic, $date ) = @_;
    my $file = &parseComic ( $comics, $comic, $date );

    ## Single-quote a value for the shell so characters such as '&' and
    ## '?' in a URL reach wget untouched.
    my $quote = sub {
        my ( $text ) = @_;
        $text = '' if ( ! defined $text );
        $text =~ s/'/'\\''/g;
        return "'$text'";
    };

    ##
    ## Save the file to the appropriate directory.  $imageDir is the
    ## directory the main program created and later inspects.
    ##
    my $target = "$imageDir/$comic-$date->{'day2'}.jpg";

    my $cmd = "wget -q " . $quote->( $file ) .
              " --referer=" . $quote->( $comics->{$comic}{'url'} ) .
              " --user-agent=" . $quote->( $USER_AGENT ) .
              " -O - | /usr/bin/convert - " . $quote->( "jpeg:$target" );

    return system($cmd);
}
373:
374: #######################################################################
375: #######################################################################
##
## indexDownload( \%comics, $comic, \%dates )
##
## Fetches the comic's web page, looks for the image address with the
## comic's 'search' pattern (its first capture group; the last matching
## line wins) and downloads that image into the image directory.
##
## When the address contains gif, jpg or png, the comic's 'ext' value is
## updated to match so later steps use the right file name.
##
## Returns 0 on success or an "ERROR: ..." string when the page cannot
## be read, no image address is found, or the image download fails.
##
sub indexDownload {
    my ( $comics, $comic, $date ) = @_;
    my $entry      = $comics->{$comic};
    my $pageURL    = $entry->{'url'};
    my $comicIndex = "$indexDir/index.$comic";
    my ( @lines, $comicLine );

    ## Fetch the page.  List-form system() keeps the URL away from the
    ## shell.  A failed fetch is caught by the checks further down.
    system( 'wget', '-q', "--referer=$pageURL", "--user-agent=$USER_AGENT",
            $pageURL, '-O', $comicIndex );

    my $fh;
    if ( ! open( $fh, '<', $comicIndex ) ) {
        return "ERROR: Can't open index file for " . $entry->{'fullName'} .
               " (" . $pageURL . ")";
    }
    while ( my $line = <$fh> ) {
        ## NOTE(review): this strips line endings, tabs and runs of
        ## spaces from each line, but lines are still matched one by
        ## one below.  Confirm whether they were meant to be joined.
        $line =~ s/\R|\ \ +|\t//g if ( $entry->{'remove_newlines'} );
        push @lines, $line;
    }
    close( $fh );

    unlink( $comicIndex );

    ## Scheme and host of the page, used to complete relative addresses.
    my ( $mainURL ) = $pageURL =~ m{^(https?://[^/]+)}i;
    $mainURL = $pageURL if ( ! defined $mainURL );
    my ( $scheme ) = $pageURL =~ m{^(https?:)}i;
    $scheme = 'http:' if ( ! defined $scheme );

    ##
    ## Find the comic strip URL based on the specified regex in the search
    ##
    foreach my $line ( @lines ) {
        ## "defined $1" guards against a pattern without a capture group.
        if ( $line =~ m/$entry->{'search'}/i && defined $1 ) {
            $comicLine = $1;
            chomp $comicLine;
        }
    }

    if ( ! $comicLine ) {
        return "ERROR: Could not download comic $entry->{'fullName'}";
    }

    ## Prefer a real file extension; fall back to the looser match.
    if ( $comicLine =~ m/\.(gif|jpg|png)(?:[?#]|$)/i ) {
        $entry->{'ext'} = $1;
    }
    elsif ( $comicLine =~ m/(gif|jpg|png)/i ) {
        $entry->{'ext'} = $1;
    }

    my $comicURL;
    if ( $comicLine =~ m{^//} ) {
        $comicURL = $scheme . $comicLine;      ## protocol-relative
    }
    elsif ( $comicLine =~ m/http/ ) {
        $comicURL = $comicLine;                ## already absolute
    }
    else {
        $comicURL = $mainURL . $comicLine;     ## relative to the host
    }

    ##
    ## Save the file to the appropriate directory
    ##
    my $target = "$imageDir/$comic-$date->{'day2'}.$entry->{'ext'}";
    my $status = system( 'wget', "--user-agent=$USER_AGENT",
                         "--referer=$pageURL", '-q', $comicURL,
                         '-O', $target );

    return "ERROR: Could not download comic $entry->{'fullName'}"
        if ( $status != 0 );

    return 0;
}
431:
432: #######################################################################
433: #######################################################################
##
## parseComic( \%comics, $comic, \%dates )
##
## Expands the date and extension placeholders in the comic's 'search'
## template and returns the result without a trailing newline.
##
## Placeholders are replaced in the order listed below.
##
sub parseComic ($$) {
    my ( $comics, $comic, $date ) = @_;
    my $result = $comics->{$comic}{'search'};

    my @rules = (
        [ '__year__',  $date->{'year'}  ],
        [ '__year2__', $date->{'year2'} ],
        [ '__mon__',   $date->{'mon'}   ],
        [ '__mon2__',  $date->{'mon2'}  ],
        [ '__day__',   $date->{'day'}   ],
        [ '__day2__',  $date->{'day2'}  ],
        [ '__ext__',   $comics->{$comic}{'ext'} ],
    );

    foreach my $rule ( @rules ) {
        my ( $token, $value ) = @{$rule};
        $result =~ s/$token/$value/g;
    }
    chomp $result;

    return $result;
}
449:
450: #######################################################################
451: #######################################################################
##
## fetchDates( )
##
## Returns a hash describing the local date $days_ago days before now:
##   day, mon, year   - numeric day of month, month (1-12), 4 digit year
##   day2, mon2       - day and month padded to two digits
##   year2            - last two digits of the year
##   dow              - day of week index, 0 = Sunday
##   wday             - day of week name
##
sub fetchDates () {
    my @dayNames = qw/ Sunday Monday Tuesday Wednesday Thursday Friday Saturday /;
    my $when     = time - (86400 * $days_ago );

    my ( $mday, $month, $year, $wday ) = ( localtime( $when ) )[3,4,5,6];
    $year  += 1900;
    $month += 1;

    my %dates = (
        'day'   => $mday,
        'mon'   => $month,
        'year'  => $year,
        'dow'   => $wday,
        'year2' => substr( $year, 2, 2 ),
        'day2'  => sprintf( "%02d", $mday ),
        'mon2'  => sprintf( "%02d", $month ),
        'wday'  => $dayNames[$wday],
    );

    return %dates;
}
1.8 nick 467:
###############################################################################
##
## &fetchOptions( );
##
## Reads the command line (--days [N], --help / -?, --man) and returns
## the values as a hash with the keys 'days', 'help' and 'man'.  Options
## that were not given are present with an undefined value.
##
## Shows the usage text and exits on an unknown option or --help, and
## shows the full manual page on --man.
##
###############################################################################
sub fetchOptions {
    my ( $days, $help, $man );

    my $parsed = GetOptions(
        "days:i" => \$days,
        "help|?" => \$help,
        "man"    => \$man,
    );

    pod2usage( ) if ( ! $parsed );
    pod2usage( ) if ( defined $help );
    pod2usage( { -verbose => 2, -input => \*DATA } ) if ( defined $man );

    my %opts = (
        'days' => $days,
        'help' => $help,
        'man'  => $man,
    );

    return %opts;
}
488:
489: __END__
490:
491: =head1 NAME
492:
493: fetch.pl - Fetches comics and places them all locally in a single html file.
494:
495: =head1 SYNOPSIS
496:
497: fetch.pl [options]
498:
499: Options:
500: --days,d Fetch comics from X days ago
501: --help,? Display the basic help menu
502: --man,m Display the detailed man page
503:
504: =head1 DESCRIPTION
505:
506: =head1 HISTORY
507:
508: =head1 AUTHOR
509:
510: Nicholas DeClario <nick@declario.com>
511:
512: =head1 BUGS
513:
514: This is a work in progress. Please report all bugs to the author.
515:
516: =head1 SEE ALSO
517:
518: =head1 COPYRIGHT
519:
520: =cut
521:
522:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>