--- comics/fetch.pl.new 2017/12/05 13:37:40 1.22 +++ comics/fetch.pl.new 2024/12/13 16:03:49 1.31 @@ -2,6 +2,41 @@ ############################################################################### # $Log: fetch.pl.new,v $ +# Revision 1.31 2024/12/13 16:03:49 nick +# This adds the ability to specify a comic as a link only with a default splash image. +# +# Revision 1.30 2022/10/04 12:02:03 nick +# Added --no-check-certificate for wget calls as arcamax was failing its cert check. Meh, whatever. It's just comics. +# +# Revision 1.29 2020/06/10 21:32:52 nick +# Centered page +# +# Revision 1.28 2020/06/10 21:14:31 nick +# Updated for w3 validation. +# +# Revision 1.27 2019/04/15 12:50:23 nick +# The script was unable to handle html '&' and convert it, so I added that. I probably should see if there's a library or something that handles all those automagically but I just tossed a regex in there for now that does the trick. +# +# Revision 1.26 2018/04/22 14:03:54 nick +# Changed the default for Sunday comics that was causing issues with some comics. +# +# Revision 1.25 2018/02/12 13:30:58 nick +# Added an easier to compare date string to determine if the status json file was updated today and report if it wasn't. +# +# Revision 1.24 2018/02/06 14:31:06 nick +# A status report is now generated in JSON that can easily be scanned so that +# I can be alerted when there are failures that I miss if I don't read the +# comics that day. +# +# Revision 1.23 2018/01/26 13:05:27 nick +# Added a new config option to remove all newline from the resulting index.html +# file. This allows for easier parsing for certain comics. I then updated +# the URLs to search for and enabled the newline removal for a handful +# of uComics. +# +# I believe I've also properly fixed the Comic Config version displayed on +# the webpage itself. +# # Revision 1.22 2017/12/05 13:37:40 nick # Added the CVS config version to the outpuit. # @@ -33,15 +68,19 @@ use File::Path; use Data::Dumper; use Pod::Usage; use Getopt::Long; - +use JSON::Create 'create_json'; use Date::Calc qw/Date_to_Text_Long Today Day_of_Week Day_of_Week_to_Text/; +use Data::Dumper; + +print("Running"); ## ## Some default values ## -my $ver = '$Id: fetch.pl.new,v 1.22 2017/12/05 13:37:40 nick Exp $'; +my $ver = '$Id: fetch.pl.new,v 1.31 2024/12/13 16:03:49 nick Exp $'; my $comicFile = "comics.conf"; my $comicConfigVer = "Unknown"; +my $reportFile = "/home/httpd/html/daily/comics/status_report.json"; my %comics = &readComicConfig ( $comicFile ); my %opts = &fetchOptions( ); my $days_ago = $opts{'days'} || 0; @@ -65,10 +104,12 @@ print STDOUT "Starting comic fetch at $D foreach my $comic ( sort keys %comics ) { + print("Checking Comic $comic\n"); + ## Skip if this is Sunday and the comic is weekdays only next if ( $comic =~ m/config/ ); if (($dates{'wday'} eq "Sunday") && - ($comics{$comic}{'sunday'} == 0)) { + ($comics{$comic}{'not_sunday'} == 1)) { print "Skipping '$comic'; Weekdays only.\n"; next; } @@ -79,31 +120,36 @@ foreach my $comic ( sort keys %comics ) print "Skipping '$comic' ($comics{$comic}{'sunday_only'}); Sunday only.\n"; next } - + $comics{$comic}{'error'} = &downloadComic ( \%comics, $comic, \%dates ); &writeComic ( \%comics, $comic, \%dates ); my $file = "$imageDir/$comic-$dates{'day2'}.$comics{$comic}{'ext'}"; my $size = 0; - my $cmd = "/usr/bin/identify -verbose $file|"; - open(IMG, $cmd) || die ("Can't open: $!\n"); - while() { - if ($_ =~ m/^\s+geometry:\s+(\d+)x\d+.*/i) { - $size = $1 if ( $size == 0); - } - } - close(IMG); - - - system( "/usr/bin/convert -resize 640 $file $file" ) - if ( $size > 640 ) + ## Resize downloaded images + if($comics{$comic}{'mode'} != 3) { + my $cmd = "/usr/bin/identify -verbose $file|"; + open(IMG, $cmd) || die ("Can't open: $!\n"); + while() { + if ($_ =~ m/^\s+geometry:\s+(\d+)x\d+.*/i) { + $size = $1 if ( $size == 0); + } + } + close(IMG); + + system( "/usr/bin/convert -resize 800 $file $file" ) + if ( $size > 800 ) + } } ## &writeMainIndex ( \%dates ); &writeFooter( \%dates ); +print STDOUT "Status written to $reportFile.\n" + if (&writeStatusReportJSON(\%comics, $reportFile)); + $DATE=`date`; chomp( $DATE ); print STDOUT "Completed comic fetch at $DATE\n"; @@ -131,6 +177,10 @@ sub downloadComic ($$) { return directDownload ( \%comics, $comic, $date ); last SWITCH; } + if ( $comics->{$comic}{'mode'} eq 3 ) { + return 0; + last SWITCH; + } } return "ERROR: Unknown download method specified for $comics->{$comic}{'fullName'}."; @@ -150,7 +200,8 @@ sub readComicConfig ($$) { open FILEN, "<$comicFile"; while () { - if ($_ =~ m/^#.* \$Id: fetch.pl.new,v 1.22 2017/12/05 13:37:40 nick Exp $/) { + #if ($_ =~ m/^#.* \$[Ii][Dd]: fetch.pl.new,v 1.23 2018/01/26 13:05:27 nick Exp $/) { + if ($_ =~ m/^#.* \$[Ii][dD]: .*,v\ (.*)\ \d{4}\/.*\$$/) { $comicConfigVer = $1; } if ( ( $_ !~ m/^#/ ) && ( $_ =~ m/,.*,/) ){ @@ -164,8 +215,9 @@ sub readComicConfig ($$) { $comicConfig{$res[0]}{'mode'} = $res[3]; $comicConfig{$res[0]}{'fullName'} = $res[4]; $comicConfig{$res[0]}{'ext'} = $res[5]; - $comicConfig{$res[0]}{'sunday'} = sprintf("%d", $res[6] || 1); + $comicConfig{$res[0]}{'not_sunday'} = sprintf("%d", $res[6] || 0); $comicConfig{$res[0]}{'sunday_only'} = sprintf("%d", $res[7] || 0); + $comicConfig{$res[0]}{'remove_newlines'} = sprintf("%d", $res[8] || 0); $comicConfig{$res[0]}{'error'} = 0; } elsif ( $_ =~ m/(.*)\s+=\s+(.*)/ ) { @@ -179,12 +231,47 @@ sub readComicConfig ($$) { ####################################################################### ####################################################################### +sub writeStatusReportJSON ($$) { + my ( $comicsRef, $filename ) = @_; + my %comics = %$comicsRef; + my $shortDate = sprintf("%d%02d%02d", (localtime)[5] + 1900, + (localtime)[4] + 1, + (localtime)[3]); + my %json = ('date' => $shortDate, 'comics' => ()); + my $totalErrors = 0; + + foreach my $comic (sort keys %comics) { + next unless $comics{$comic}{'fullName'}; + if ($comics{$comic}{'error'}) { + my %error = ('comicName' => "$comics{$comic}{'fullName'}", + 'error' => "$comics{$comic}{'error'}", + 'status' => "Error"); + push @{$json{'comics'}}, \%error; + $totalErrors += 1; + } else { + my %status = ('comicName' => "$comics{$comic}{'fullName'}", + 'error' => 0, + 'status' => "Successfull"); + push @{$json{'comics'}}, \%status; + } + } + $json{'totalErrors'} = $totalErrors; + + open SR, ">$filename" or die ("ERROR: Failed to create status report: $!\n"); + print SR create_json (\%json); + close(SR); +} + +####################################################################### +####################################################################### sub writeComic ($$) { my ( $comics, $comic, $date ) = @_; my $sd = substr( join( '', $days[$date->{'dow'}] ), 0, 3 ); my $indexFile = $indexDir . "/index-" . $date->{'year2'} . $date->{'mon2'} . $date->{'day2'} . "-" . $sd . ".html"; + $comics->{$comic}{'fullName'} =~ s/&/&/g; + my $content = <{$comic}{'fullName'}) ******* --> @@ -192,11 +279,22 @@ sub writeComic ($$) { $comics->{$comic}{'fullName'}     - + $comics->{$comic}{'url'}
+EOF + if ( $comics->{$comic}{'mode'} == 3 ) { + print("Mode 3\n"); + $content .= < +EOF + } else { + $content .= < +EOF + } + $content .= <
@@ -247,15 +345,13 @@ sub writeFooter { print INDEX <
- -Generated on: $sysDate
-Version: $ver
-Config Version: $comicConfigVer
-CVS: http://demandred.dyndns.org/cgi-bin/cvsweb/comics/ -

+Generated on: $sysDate
+Version: $ver
+Config Version: $comicConfigVer
+CVS: http://demandred.dyndns.org/cgi-bin/cvsweb/comics/ +
Valid XHTML 1.0 Transitional -

@@ -292,19 +388,15 @@ sub writeTitle ($$) { - - + + Daily Comics for $today - -
- - +
+ - - EOF close (INDEX); } @@ -321,30 +413,51 @@ sub directDownload ($$) { my $cDir = $date->{'mon2'} . $date->{'year2'}; my $cDate = $date->{'day2'}; - my $cmd = "wget -q $file --referer=\"" . $comics->{$comic}{'url'} ."\" --user-agent=\"$USER_AGENT\" -O - | /usr/bin/convert - jpeg:images/$cDir/$comic-$cDate.jpg"; + my $cmd = "wget --no-check-certificate -q $file --referer='" . $comics->{$comic}{'url'} ."' --user-agent=\"$USER_AGENT\" -O - | /usr/bin/convert - jpeg:images/$cDir/$comic-$cDate.jpg"; return system($cmd); } ####################################################################### ####################################################################### +sub linkOnly ($$) { + my ( $comics, $comic, $date ) = @_; + + return 0; +} +####################################################################### +####################################################################### sub indexDownload ($$) { my ( $comics, $comic, $date ) = @_; my ( @lines, $comicLine, $mainURL ); my $comicIndex = "indexes/index.$comic"; - my $wget_cmd = "wget -q --referer=\"$comics->{$comic}{'url'}\" " . - "--user-agent=\"$USER_AGENT\" " . + print("Getching Index $comicIndex.\n"); + print("comic url: $comics->{$comic}{'url'}\n"); + + print Dumper($comics->{$comic}); + + my $wget_cmd = "wget --referer='$comics->{$comic}{'url'}' " . + "--no-check-certificate --user-agent=\"$USER_AGENT\" " . "$comics->{$comic}{'url'} -O $comicIndex"; - system($wget_cmd); + print ("Using wget command:\n$wget_cmd\n"); + + my $status = system($wget_cmd); + + print ("Return status: $status\n"); if ( ! open FILEN, "<$comicIndex" ) { return "ERROR: Can't open index file for " . $comics->{$comic}{'fullName'} . " (" . $comics->{$comic}{'url'} . ")"; } - @lines = ; + while () { + my $line = $_; + $line =~ s/\R|\ \ +|\t//g if ( $comics->{$comic}{'remove_newlines'} ); + push @lines, $line; + } close (FILEN); + unlink ("$comicIndex"); $mainURL = $comics->{$comic}{'url'}; @@ -355,9 +468,14 @@ sub indexDownload ($$) { ## ## Find the comic strip URL based on the specified regex in the search ## + + print "Using search $comics->{$comic}{'search'}\n"; + foreach my $line (@lines) { if ( $line =~ m/$comics->{$comic}{'search'}/i ) { + print "Found match:\n"; $comicLine = $1; chomp $comicLine; + print "+ $comicLine\n"; } } @@ -368,9 +486,13 @@ sub indexDownload ($$) { my $cDate = $date->{'day2'}; if ( $comicLine ) { + print "Downloading Comic\n"; if ( $comicLine =~ m/(gif|jpg|png)/i ) { $comics->{$comic}{'ext'} = $1; } my $comicURL = ( $comicLine =~ m/http/ ) ? $comicLine : $mainURL . $comicLine; - my $cmd = "wget --user-agent=\"$USER_AGENT\" --referer=\"" . $comics->{$comic}{'url'} . "\" -q $comicURL -O images/$cDir/$comic-$cDate.$comics->{$comic}{'ext'}"; + print "Final URL: $comicURL\n"; + # Strip & + $comicURL =~ s/\&\;/&/g; + my $cmd = "wget --no-check-certificate --user-agent=\"$USER_AGENT\" --referer='" . $comics->{$comic}{'url'} . "' -q '$comicURL' -O images/$cDir/$comic-$cDate.$comics->{$comic}{'ext'}"; system( $cmd ); return 0; }
Comic Page Heading
$today_long