--- comics/fetch.pl.new 2015/10/22 12:58:44 1.20
+++ comics/fetch.pl.new 2020/06/10 21:14:31 1.28
@@ -2,6 +2,38 @@
###############################################################################
# $Log: fetch.pl.new,v $
+# Revision 1.28 2020/06/10 21:14:31 nick
+# Updated for w3 validation.
+#
+# Revision 1.27 2019/04/15 12:50:23 nick
+# The script was unable to handle html '&' and convert it, so I added that. I probably should see if there's a library or something that handles all those automagically but I just tossed a regex in there for now that does the trick.
+#
+# Revision 1.26 2018/04/22 14:03:54 nick
+# Changed the default for Sunday comics that was causing issues with some comics.
+#
+# Revision 1.25 2018/02/12 13:30:58 nick
+# Added an easier to compare date string to determine if the status json file was updated today and report if it wasn't.
+#
+# Revision 1.24 2018/02/06 14:31:06 nick
+# A status report is now generated in JSON that can easily be scanned so that
+# I can be alerted when there are failures that I miss if I don't read the
+# comics that day.
+#
+# Revision 1.23 2018/01/26 13:05:27 nick
+# Added a new config option to remove all newlines from the resulting index.html
+# file. This allows for easier parsing for certain comics. I then updated
+# the URLs to search for and enabled the newline removal for a handful
+# of uComics.
+#
+# I believe I've also properly fixed the Comic Config version displayed on
+# the webpage itself.
+#
+# Revision 1.22 2017/12/05 13:37:40 nick
+# Added the CVS config version to the output.
+#
+# Revision 1.21 2015/10/26 14:25:40 nick
+# Fixed a bug that was improperly including the day of week string preventing the weekend comics from fetching properly.
+#
# Revision 1.20 2015/10/22 12:58:44 nick
# Added the ability for Sunday only comics. Stonesoup is no longer weekdays, this has been added to Sunday only. I also added Foxtrot Classics for weekdays and Foxtrot for Sundays.
#
@@ -27,14 +59,16 @@ use File::Path;
use Data::Dumper;
use Pod::Usage;
use Getopt::Long;
-
-use Date::Calc qw/Date_to_Text_Long Today/;
+use JSON::Create 'create_json';
+use Date::Calc qw/Date_to_Text_Long Today Day_of_Week Day_of_Week_to_Text/;
##
## Some default values
##
-my $ver = '$Id: fetch.pl.new,v 1.20 2015/10/22 12:58:44 nick Exp $';
+my $ver = '$Id: fetch.pl.new,v 1.28 2020/06/10 21:14:31 nick Exp $';
my $comicFile = "comics.conf";
+my $comicConfigVer = "Unknown";
+my $reportFile = "/home/httpd/html/daily/comics/status_report.json";
my %comics = &readComicConfig ( $comicFile );
my %opts = &fetchOptions( );
my $days_ago = $opts{'days'} || 0;
@@ -60,19 +94,19 @@ foreach my $comic ( sort keys %comics )
## Skip if this is Sunday and the comic is weekdays only
next if ( $comic =~ m/config/ );
- if (($dates{'day2'} eq "Sunday") &&
- ($comics{$comic}{'sunday'} == 0)) {
+ if (($dates{'wday'} eq "Sunday") &&
+ ($comics{$comic}{'not_sunday'} == 1)) {
print "Skipping '$comic'; Weekdays only.\n";
next;
}
## Skip if Sunday only comic and it's not Sunday.
- if (($dates{'day2'} ne "Sunday") &&
+ if (($dates{'wday'} ne "Sunday") &&
($comics{$comic}{'sunday_only'} == 1)) {
- print "Skipping '$comic'; Sunday only.\n";
+ print "Skipping '$comic' ($comics{$comic}{'sunday_only'}); Sunday only.\n";
next
}
-
+
$comics{$comic}{'error'} = &downloadComic ( \%comics, $comic, \%dates );
&writeComic ( \%comics, $comic, \%dates );
@@ -97,6 +131,9 @@ foreach my $comic ( sort keys %comics )
&writeFooter( \%dates );
+print STDOUT "Status written to $reportFile.\n"
+ if (&writeStatusReportJSON(\%comics, $reportFile));
+
$DATE=`date`; chomp( $DATE );
print STDOUT "Completed comic fetch at $DATE\n";
@@ -143,6 +180,10 @@ sub readComicConfig ($$) {
open FILEN, "<$comicFile";
while () {
+ #if ($_ =~ m/^#.* \$[Ii][Dd]: fetch.pl.new,v 1.23 2018/01/26 13:05:27 nick Exp $/) {
+ if ($_ =~ m/^#.* \$[Ii][dD]: .*,v\ (.*)\ \d{4}\/.*\$$/) {
+ $comicConfigVer = $1;
+ }
if ( ( $_ !~ m/^#/ ) && ( $_ =~ m/,.*,/) ){
$_ =~ s/__YEAR__/$year/g;
$_ =~ s/__MON__/$mon/g;
@@ -154,8 +195,9 @@ sub readComicConfig ($$) {
$comicConfig{$res[0]}{'mode'} = $res[3];
$comicConfig{$res[0]}{'fullName'} = $res[4];
$comicConfig{$res[0]}{'ext'} = $res[5];
- $comicConfig{$res[0]}{'sunday'} = $res[6] || 1;
- $comicConfig{$res[0]}{'sunday_only'} = $res[7] || 0;
+ $comicConfig{$res[0]}{'not_sunday'} = sprintf("%d", $res[6] || 0);
+ $comicConfig{$res[0]}{'sunday_only'} = sprintf("%d", $res[7] || 0);
+ $comicConfig{$res[0]}{'remove_newlines'} = sprintf("%d", $res[8] || 0);
$comicConfig{$res[0]}{'error'} = 0;
}
elsif ( $_ =~ m/(.*)\s+=\s+(.*)/ ) {
@@ -169,12 +211,46 @@ sub readComicConfig ($$) {
#######################################################################
#######################################################################
+## Write a JSON status report summarizing the outcome of each comic fetch.
+##
+## Arguments:
+##   $comicsRef - reference to the comics hash; entries without a true
+##                'fullName' are skipped, and a true 'error' value marks
+##                that comic as failed.
+##   $filename  - path of the report file, which is overwritten.
+##
+## Returns the result of close() on the report file (true on success).
+## Dies if the report file cannot be opened for writing.
+sub writeStatusReportJSON ($$) {
+    my ( $comicsRef, $filename ) = @_;
+
+    ## Read the clock once so the date parts cannot straddle midnight.
+    my ( $mday, $mon, $year ) = (localtime)[3, 4, 5];
+    my $shortDate = sprintf("%d%02d%02d", $year + 1900, $mon + 1, $mday);
+
+    ## 'comics' must be an array reference: an empty list () would vanish
+    ## from the hash constructor, leaving the key undefined (JSON null)
+    ## whenever no comic is recorded.
+    my %json = ('date' => $shortDate, 'comics' => []);
+    my $totalErrors = 0;
+
+    foreach my $comic (sort keys %$comicsRef) {
+        my $entry = $comicsRef->{$comic};
+        next unless $entry->{'fullName'};
+
+        my $failed = $entry->{'error'} ? 1 : 0;
+
+        ## NOTE(review): the 'Successfull' spelling is preserved on purpose;
+        ## anything scanning the report may match on it -- confirm before
+        ## correcting.
+        push @{ $json{'comics'} }, {
+            'comicName' => "$entry->{'fullName'}",
+            'error'     => $failed ? "$entry->{'error'}" : 0,
+            'status'    => $failed ? "Error" : "Successfull",
+        };
+        $totalErrors += $failed;
+    }
+    $json{'totalErrors'} = $totalErrors;
+
+    open my $reportFh, '>', $filename
+        or die ("ERROR: Failed to create status report: $!\n");
+    print {$reportFh} create_json (\%json);
+
+    ## Buffered write errors surface at close, so hand its result back.
+    return close($reportFh);
+}
+
+#######################################################################
+#######################################################################
sub writeComic ($$) {
my ( $comics, $comic, $date ) = @_;
my $sd = substr( join( '', $days[$date->{'dow'}] ), 0, 3 );
my $indexFile = $indexDir . "/index-" . $date->{'year2'} .
$date->{'mon2'} . $date->{'day2'} . "-" .
$sd . ".html";
+ $comics->{$comic}{'fullName'} =~ s/&/&/g;
my $content = <{$comic}{'fullName'}) ******* -->
@@ -237,14 +313,13 @@ sub writeFooter {
print INDEX <
-
-Generated on: $sysDate
-Version: $ver
-CVS: http://demandred.dyndns.org/cgi-bin/cvsweb/comics/
-
+Generated on: $sysDate
+Version: $ver
+Config Version: $comicConfigVer
+CVS: http://demandred.dyndns.org/cgi-bin/cvsweb/comics/
+
-
-
-
- |
+ |
$today_long |
|
-
-
EOF
close (INDEX);
}
@@ -310,7 +381,7 @@ sub directDownload ($$) {
my $cDir = $date->{'mon2'} . $date->{'year2'};
my $cDate = $date->{'day2'};
- my $cmd = "wget -q $file --referer=\"" . $comics->{$comic}{'url'} ."\" --user-agent=\"$USER_AGENT\" -O - | /usr/bin/convert - jpeg:images/$cDir/$comic-$cDate.jpg";
+ my $cmd = "wget -q $file --referer='" . $comics->{$comic}{'url'} ."' --user-agent=\"$USER_AGENT\" -O - | /usr/bin/convert - jpeg:images/$cDir/$comic-$cDate.jpg";
return system($cmd);
}
@@ -322,7 +393,7 @@ sub indexDownload ($$) {
my ( @lines, $comicLine, $mainURL );
my $comicIndex = "indexes/index.$comic";
- my $wget_cmd = "wget -q --referer=\"$comics->{$comic}{'url'}\" " .
+ my $wget_cmd = "wget -q --referer='$comics->{$comic}{'url'}' " .
"--user-agent=\"$USER_AGENT\" " .
"$comics->{$comic}{'url'} -O $comicIndex";
system($wget_cmd);
@@ -331,9 +402,14 @@ sub indexDownload ($$) {
return "ERROR: Can't open index file for " . $comics->{$comic}{'fullName'} .
" (" . $comics->{$comic}{'url'} . ")";
}
- @lines = ;
+ while () {
+ my $line = $_;
+ $line =~ s/\R|\ \ +|\t//g if ( $comics->{$comic}{'remove_newlines'} );
+ push @lines, $line;
+ }
close (FILEN);
+
unlink ("$comicIndex");
$mainURL = $comics->{$comic}{'url'};
@@ -344,6 +420,7 @@ sub indexDownload ($$) {
##
## Find the comic strip URL based on the specified regex in the search
##
+
foreach my $line (@lines) {
if ( $line =~ m/$comics->{$comic}{'search'}/i ) {
$comicLine = $1; chomp $comicLine;
@@ -359,7 +436,9 @@ sub indexDownload ($$) {
if ( $comicLine ) {
if ( $comicLine =~ m/(gif|jpg|png)/i ) { $comics->{$comic}{'ext'} = $1; }
my $comicURL = ( $comicLine =~ m/http/ ) ? $comicLine : $mainURL . $comicLine;
- my $cmd = "wget --user-agent=\"$USER_AGENT\" --referer=\"" . $comics->{$comic}{'url'} . "\" -q $comicURL -O images/$cDir/$comic-$cDate.$comics->{$comic}{'ext'}";
+ # Strip &
+ $comicURL =~ s/\&\;/&/g;
+ my $cmd = "wget --user-agent=\"$USER_AGENT\" --referer='" . $comics->{$comic}{'url'} . "' -q '$comicURL' -O images/$cDir/$comic-$cDate.$comics->{$comic}{'ext'}";
system( $cmd );
return 0;
}
@@ -399,6 +478,8 @@ sub fetchDates () {
$dates{'day2'} = ( $dates{'day'} < 10 ) ? "0" . $dates{'day'} : $dates{'day'};
$dates{'mon'}++;
$dates{'mon2'} = ( $dates{'mon'} < 10 ) ? "0".$dates{'mon'} : $dates{'mon'};
+ my @days = qw/ Sunday Monday Tuesday Wednesday Thursday Friday Saturday /;
+ $dates{'wday'} = $days[$dates{'dow'}];
return %dates;
}
|