--- comics/fetch.pl.new 2018/04/22 14:03:54 1.26 +++ comics/fetch.pl.new 2019/04/15 12:50:23 1.27 @@ -2,6 +2,9 @@ ############################################################################### # $Log: fetch.pl.new,v $ +# Revision 1.27 2019/04/15 12:50:23 nick +# The script was unable to handle html '&amp;' and convert it, so I added that. I probably should see if there's a library or something that handles all those automagically but I just tossed a regex in there for now that does the trick. +# # Revision 1.26 2018/04/22 14:03:54 nick # Changed the default for Sunday comics that was causing issues with some comics. # @@ -59,7 +62,7 @@ use Date::Calc qw/Date_to_Text_Long Toda ## ## Some default values ## -my $ver = '$Id: fetch.pl.new,v 1.26 2018/04/22 14:03:54 nick Exp $'; +my $ver = '$Id: fetch.pl.new,v 1.27 2019/04/15 12:50:23 nick Exp $'; my $comicFile = "comics.conf"; my $comicConfigVer = "Unknown"; my $reportFile = "/home/httpd/html/daily/comics/status_report.json"; @@ -211,7 +214,7 @@ sub writeStatusReportJSON ($$) { my $shortDate = sprintf("%d%02d%02d", (localtime)[5] + 1900, (localtime)[4] + 1, (localtime)[3]); - my %json = ('date' => $shortDate, 'comics' => []); + my %json = ('date' => $shortDate, 'comics' => ()); my $totalErrors = 0; foreach my $comic (sort keys %comics) { @@ -220,13 +223,13 @@ sub writeStatusReportJSON ($$) { my %error = ('comicName' => "$comics{$comic}{'fullName'}", 'error' => "$comics{$comic}{'error'}", 'status' => "Error"); - push $json{'comics'}, \%error; + push @{$json{'comics'}}, \%error; $totalErrors += 1; } else { my %status = ('comicName' => "$comics{$comic}{'fullName'}", 'error' => 0, 'status' => "Successfull"); - push $json{'comics'}, \%status; + push @{$json{'comics'}}, \%status; } } $json{'totalErrors'} = $totalErrors; @@ -380,7 +383,7 @@ sub directDownload ($$) { my $cDir = $date->{'mon2'} . $date->{'year2'}; my $cDate = $date->{'day2'}; - my $cmd = "wget -q $file --referer=\"" . 
$comics->{$comic}{'url'} ."\" --user-agent=\"$USER_AGENT\" -O - | /usr/bin/convert - jpeg:images/$cDir/$comic-$cDate.jpg"; + my $cmd = "wget -q $file --referer='" . $comics->{$comic}{'url'} ."' --user-agent=\"$USER_AGENT\" -O - | /usr/bin/convert - jpeg:images/$cDir/$comic-$cDate.jpg"; return system($cmd); } @@ -392,7 +395,7 @@ sub indexDownload ($$) { my ( @lines, $comicLine, $mainURL ); my $comicIndex = "indexes/index.$comic"; - my $wget_cmd = "wget -q --referer=\"$comics->{$comic}{'url'}\" " . + my $wget_cmd = "wget -q --referer='$comics->{$comic}{'url'}' " . "--user-agent=\"$USER_AGENT\" " . "$comics->{$comic}{'url'} -O $comicIndex"; system($wget_cmd); @@ -403,11 +406,12 @@ sub indexDownload ($$) { } while (<FILEN>) { my $line = $_; - $line =~ s/\R|\ \ +|\t//g if ( $comics->{$comic}{'remove_newliens'} ); + $line =~ s/\R|\ \ +|\t//g if ( $comics->{$comic}{'remove_newlines'} ); push @lines, $line; } close (FILEN); + unlink ("$comicIndex"); $mainURL = $comics->{$comic}{'url'}; @@ -418,6 +422,7 @@ sub indexDownload ($$) { ## ## Find the comic strip URL based on the specified regex in the search ## + foreach my $line (@lines) { if ( $line =~ m/$comics->{$comic}{'search'}/i ) { $comicLine = $1; chomp $comicLine; @@ -433,7 +438,9 @@ sub indexDownload ($$) { if ( $comicLine ) { if ( $comicLine =~ m/(gif|jpg|png)/i ) { $comics->{$comic}{'ext'} = $1; } my $comicURL = ( $comicLine =~ m/http/ ) ? $comicLine : $mainURL . $comicLine; - my $cmd = "wget --user-agent=\"$USER_AGENT\" --referer=\"" . $comics->{$comic}{'url'} . "\" -q $comicURL -O images/$cDir/$comic-$cDate.$comics->{$comic}{'ext'}"; + # Strip &amp; + $comicURL =~ s/\&amp\;/&/g; + my $cmd = "wget --user-agent=\"$USER_AGENT\" --referer='" . $comics->{$comic}{'url'} . "' -q '$comicURL' -O images/$cDir/$comic-$cDate.$comics->{$comic}{'ext'}"; system( $cmd ); return 0; }