comics/fetch.pl.new - view

File: [Local Repository] / comics / fetch.pl.new
Revision 1.30: download - view: text, annotated - select for diffs
Tue Oct 4 12:02:03 2022 UTC (19 months, 1 week ago) by nick
Branches: MAIN
CVS tags: HEAD

Added --no-check-certificate for wget calls as arcamax was failing its cert check.  Meh, whatever.  It's just comics.

#!/usr/bin/perl -w
###############################################################################
# $Log: fetch.pl.new,v $
# Revision 1.30  2022/10/04 12:02:03  nick
# Added --no-check-certificate for wget calls as arcamax was failing its cert
# check.  Meh, whatever.  It's just comics.
#
# Revision 1.29  2020/06/10 21:32:52  nick
# Centered page
#
# Revision 1.28  2020/06/10 21:14:31  nick
# Updated for w3 validation.
#
# Revision 1.27  2019/04/15 12:50:23  nick
# The script was unable to handle html '&' and convert it, so I added that.
# I probably should see if there's a library or something that handles all
# those automagically but I just tossed a regex in there for now that does
# the trick.
#
# Revision 1.26  2018/04/22 14:03:54  nick
# Changed the default for Sunday comics that was causing issues with some
# comics.
#
# Revision 1.25  2018/02/12 13:30:58  nick
# Added an easier to compare date string to determine if the status json file
# was updated today and report if it wasn't.
#
# Revision 1.24  2018/02/06 14:31:06  nick
# A status report is now generated in JSON that can easily be scanned so that
# I can be alerted when there are failures that I miss if I don't read the
# comics that day.
#
# Revision 1.23  2018/01/26 13:05:27  nick
# Added a new config option to remove all newline from the resulting index.html
# file.  This allows for easier parsing for certain comics.  I then updated
# the URLs to search for and enabled the newline removal for a handful
# of uComics.
#
# I believe I've also properly fixed the Comic Config version displayed on
# the webpage itself.
#
# Revision 1.22  2017/12/05 13:37:40  nick
# Added the CVS config version to the outpuit.
#
# Revision 1.21  2015/10/26 14:25:40  nick
# Fixed a bug that was improperly including the day of week string preventing
# the weekend comics from fetching proproperly.
#
# Revision 1.20  2015/10/22 12:58:44  nick
# Added the ability for Sunday only comics.  Stonesoup is no longer weekdays,
# this has been added to Sunday only.
# I also added Foxtrot Classics for weekdays and Foxtrot for Sundays.
#
# Revision 1.19  2015/07/13 12:56:58  nick
# Added Sally Forth and Pearls Before Swine.  Adding Sally Forth required a
# change in the 'wget' command for fetching the index file to include
# 'user-agent' and 'referer'.
#
# Revision 1.18  2015/05/07 12:31:43  nick
# Added favicon
#
# Revision 1.17  2015/02/19 14:56:10  nick
# Fixed a problem that forced everything to JPG.  This would kill GIF
# animations, but would not display the gifs either because 'convert' appends
# an index number to the end of the file name for each from of the GIF
# animation.  I fixed this to maintain GIF compatibilty as well as rewritting
# how the script fetches the size of the file.  Additionally, I updated the
# configuration for Questionable Content to search for GIF or JPG, which is
# what triggered this entire update.
#
# Revision 1.16  2015/02/05 18:05:58  nick
# Changed the background and added a fancy title.
#
# Revision 1.15  2015/01/19 13:46:19  nick
# *** empty log message ***
#
###############################################################################

use strict;
use warnings;
use File::Path;
use Data::Dumper;
use Pod::Usage;
use Getopt::Long;
use JSON::Create 'create_json';
use Date::Calc qw/Date_to_Text_Long Today Day_of_Week Day_of_Week_to_Text/;

##
## Some default values
##
my $ver            = '$Id: fetch.pl.new,v 1.30 2022/10/04 12:02:03 nick Exp $';
my $comicFile      = "comics.conf";
my $comicConfigVer = "Unknown";    # overwritten by readComicConfig()
my $reportFile     = "/home/httpd/html/daily/comics/status_report.json";

## Options are parsed before the config is read so that --help / --man work
## even when no comics.conf is present in the current directory.
my %opts     = fetchOptions();
my %comics   = readComicConfig($comicFile);
my $days_ago = $opts{'days'} || 0;
my %dates    = fetchDates();
my $baseDir  = $comics{'configs'}{'base_directory'} || ".";
my $imageDir = $baseDir . "/"
  . ( $comics{'configs'}{'image_directory'} || "images" )
  . "/$dates{'mon2'}$dates{'year2'}";
my $indexDir = $baseDir . "/"
  . ( $comics{'configs'}{'index_directory'} || "indexes" );
my $USER_AGENT =
    "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.18) "
  . "Gecko/20110628 Ubuntu/10.10 (maverick) Firefox/3.6.18";
my @days = qw/ Sunday Monday Tuesday Wednesday Thursday Friday Saturday /;

## Images wider than this (in pixels) are scaled down after download.
my $MAX_WIDTH = 800;

my $DATE = `date`;
chomp $DATE;
print STDOUT "Starting comic fetch at $DATE\n";

##
## Main program starts here
##
checkDir( $imageDir, $indexDir );
writeTitle( \%dates );

COMIC:
foreach my $comic ( sort keys %comics ) {
    ## The 'configs' entry holds key = value settings, not a comic.
    next COMIC if ( $comic =~ m/config/ );

    ## Skip if this is Sunday and the comic is weekdays only
    if ( ( $dates{'wday'} eq "Sunday" ) && ( $comics{$comic}{'not_sunday'} == 1 ) ) {
        print "Skipping '$comic'; Weekdays only.\n";
        next COMIC;
    }

    ## Skip if Sunday only comic and it's not Sunday.
    if ( ( $dates{'wday'} ne "Sunday" ) && ( $comics{$comic}{'sunday_only'} == 1 ) ) {
        print "Skipping '$comic' ($comics{$comic}{'sunday_only'}); Sunday only.\n";
        next COMIC;
    }

    $comics{$comic}{'error'} = downloadComic( \%comics, $comic, \%dates );
    writeComic( \%comics, $comic, \%dates );

    ## Nothing to resize when the download failed.
    next COMIC if ( $comics{$comic}{'error'} );

    ## 'ext' may have been updated by the download routine, so the file name
    ## is built only after the download has finished.
    my $file = "$imageDir/$comic-$dates{'day2'}.$comics{$comic}{'ext'}";
    resizeIfTooWide( $file, $MAX_WIDTH );
}

## writeMainIndex ( \%dates );
writeFooter( \%dates );

print STDOUT "Status written to $reportFile.\n"
  if ( writeStatusReportJSON( \%comics, $reportFile ) );

$DATE = `date`;
chomp($DATE);
print STDOUT "Completed comic fetch at $DATE\n";

## End

#######################################################################
## Function    : resizeIfTooWide
##
## Description :
##   Asks ImageMagick's 'identify' for the width of $file and, when it is
##   wider than $maxWidth pixels, scales it down in place with 'convert'.
##   Both tools are run without a shell so odd characters in the file
##   name cannot be interpreted as shell syntax.
##
## Returns     : 1 if the image was resized, 0 otherwise.
#######################################################################
sub resizeIfTooWide {
    my ( $file, $maxWidth ) = @_;

    return 0 if ( !-e $file );

    my $img;
    if ( !open( $img, '-|', '/usr/bin/identify', '-verbose', $file ) ) {
        ## Best effort: a missing 'identify' must not abort the whole run.
        warn "Can't run identify on $file: $!\n";
        return 0;
    }

    ## Animated GIFs report one geometry per frame; only the first counts.
    my $width = 0;
    while ( my $line = <$img> ) {
        if ( $width == 0 && $line =~ m/^\s+geometry:\s+(\d+)x\d+/i ) {
            $width = $1;
        }
    }
    close($img);

    return 0 if ( $width <= $maxWidth );

    my $status = system( '/usr/bin/convert', '-resize', $maxWidth, $file, $file );
    return ( $status == 0 ) ? 1 : 0;
}

#######################################################################
## Function    : downloadComic
##
## Description :
##   This function determines the download method being used to
##   retrieve the comic and calls the appropriate function.
##     mode 1 - indexDownload  (scrape the comic's page for the image)
##     mode 2 - directDownload (image URL is built from the date)
##
##   If the mode is invalid an error will be returned.
##
## Returns     : 0 on success, otherwise an error value/message.
#######################################################################
sub downloadComic {
    my ( $comics, $comic, $date ) = @_;

    my %handler_for = (
        1 => \&indexDownload,
        2 => \&directDownload,
    );

    my $mode = $comics->{$comic}{'mode'};
    if ( defined $mode && exists $handler_for{$mode} ) {
        return $handler_for{$mode}->( $comics, $comic, $date );
    }

    return "ERROR: Unknown download method specified for $comics->{$comic}{'fullName'}.";
}

#######################################################################
## Function    : readComicConfig
##
## Description :
##   Parses the comic configuration file.  Three kinds of lines are
##   recognised:
##     * a comment carrying a CVS Id keyword - its revision is stored in
##       the file-level $comicConfigVer;
##     * a non-comment line with at least two commas - a comic entry:
##       name,url,search,mode,fullName,ext,not_sunday,sunday_only,
##       remove_newlines
##     * "key = value" - stored under $config{'configs'}{key}.
##   __YEAR__, __MON__ and __DAY__ in comic entries are replaced with
##   today's date (the --days offset is not applied here).
##
## Returns     : hash of comic name => settings (plus 'configs').
## Dies        : when the config file cannot be opened.
#######################################################################
sub readComicConfig {
    my ($comicFile) = @_;
    my %comicConfig = ();

    my ( $day, $mon, $year ) = ( localtime(time) )[ 3, 4, 5 ];
    $year += 1900;
    $mon = sprintf( "%02d", ( $mon + 1 ) );
    $day = sprintf( "%02d", $day );

    open( my $fh, '<', $comicFile )
      or die("ERROR: Can't read comic config '$comicFile': $!\n");

    while ( my $line = <$fh> ) {
        ## Without this the last field of every entry keeps its newline.
        chomp $line;

        ## The keyword is spelled with character classes so that CVS does
        ## not expand it inside this script.
        if ( $line =~ m/^#.* \$[Ii][dD]: .*,v\ (.*)\ \d{4}\/.*\$$/ ) {
            $comicConfigVer = $1;
        }

        if ( ( $line !~ m/^#/ ) && ( $line =~ m/,.*,/ ) ) {
            $line =~ s/__YEAR__/$year/g;
            $line =~ s/__MON__/$mon/g;
            $line =~ s/__DAY__/$day/g;

            my @res  = split /,/, $line;
            my $name = $res[0];
            @{ $comicConfig{$name} }{qw( url search mode fullName ext )} = @res[ 1 .. 5 ];
            $comicConfig{$name}{'not_sunday'}      = sprintf( "%d", $res[6] || 0 );
            $comicConfig{$name}{'sunday_only'}     = sprintf( "%d", $res[7] || 0 );
            $comicConfig{$name}{'remove_newlines'} = sprintf( "%d", $res[8] || 0 );
            $comicConfig{$name}{'error'}           = 0;
        }
        elsif ( $line =~ m/(.*)\s+=\s+(.*)/ ) {
            my ( $key, $value ) = ( $1, $2 );

            ## Tolerate padding such as "key   =  value  ".
            for my $part ( $key, $value ) {
                $part =~ s/^\s+//;
                $part =~ s/\s+$//;
            }
            $comicConfig{'configs'}{$key} = $value;
        }
    }
    close($fh);

    return %comicConfig;
}

#######################################################################
## Function    : writeStatusReportJSON
##
## Description :
##   Writes a JSON status report to $filename containing today's date
##   (YYYYMMDD), one record per comic that has a fullName, and the total
##   number of errors.  The 'status' strings are reproduced exactly as
##   before because whatever scans the report may match on them.
##
## Returns     : true when the report was written and closed cleanly.
## Dies        : when the report file cannot be created.
#######################################################################
sub writeStatusReportJSON {
    my ( $comicsRef, $filename ) = @_;

    my ( $day, $mon, $year ) = (localtime)[ 3, 4, 5 ];
    my $shortDate = sprintf( "%d%02d%02d", $year + 1900, $mon + 1, $day );

    ## 'comics' must be an array reference; an empty list here would
    ## flatten away and leave the hash with an odd number of elements.
    my %json        = ( 'date' => $shortDate, 'comics' => [] );
    my $totalErrors = 0;

    foreach my $comic ( sort keys %{$comicsRef} ) {
        my $entry = $comicsRef->{$comic};
        next unless $entry->{'fullName'};    # skips the 'configs' entry

        if ( $entry->{'error'} ) {
            push @{ $json{'comics'} },
              {
                'comicName' => "$entry->{'fullName'}",
                'error'     => "$entry->{'error'}",
                'status'    => "Error",
              };
            $totalErrors += 1;
        }
        else {
            push @{ $json{'comics'} },
              {
                'comicName' => "$entry->{'fullName'}",
                'error'     => 0,
                'status'    => "Successfull",
              };
        }
    }
    $json{'totalErrors'} = $totalErrors;

    open( my $sr, '>', $filename )
      or die("ERROR: Failed to create status report: $!\n");
    print {$sr} create_json( \%json );

    return close($sr) ? 1 : 0;
}

#######################################################################
## Function    : indexFileName
##
## Description :
##   Builds the path of the generated HTML page for the given date:
##   <indexDir>/index-YYMMDD-Day.html (Day = first three letters of the
##   weekday name).
#######################################################################
sub indexFileName {
    my ($date) = @_;

    my $sd = substr( $days[ $date->{'dow'} ], 0, 3 );

    return
        $indexDir
      . "/index-"
      . $date->{'year2'}
      . $date->{'mon2'}
      . $date->{'day2'} . "-"
      . $sd . ".html";
}

#######################################################################
## Function    : escapeAmpersands
##
## Description :
##   Returns a copy of $text with every bare '&' turned into '&amp;' so
##   the generated page stays valid XHTML.  Ampersands that already start
##   an entity (&amp; &#38; &#x26; ...) are left alone.  Undefined input
##   yields an empty string.
#######################################################################
sub escapeAmpersands {
    my ($text) = @_;

    return '' if ( !defined $text );
    $text =~ s/&(?!(?:[A-Za-z][A-Za-z0-9]*|#[0-9]+|#x[0-9A-Fa-f]+);)/&amp;/g;

    return $text;
}

#######################################################################
## Function    : shellQuote
##
## Description :
##   Wraps $text in single quotes for use in a /bin/sh command line,
##   escaping any single quotes it contains.
#######################################################################
sub shellQuote {
    my ($text) = @_;

    $text = '' if ( !defined $text );
    $text =~ s/'/'\\''/g;

    return "'$text'";
}

#######################################################################
## Function    : writeComic
##
## Description :
##   Appends one table row for $comic to the day's HTML page: the image
##   when the download succeeded, or the error text when it failed.
##   The configuration itself is not modified; escaping is applied to
##   local copies only.
##
## Returns     : 0
#######################################################################
sub writeComic {
    my ( $comics, $comic, $date ) = @_;

    my $indexFile = indexFileName($date);
    my $entry     = $comics->{$comic};
    my $name      = escapeAmpersands( $entry->{'fullName'} );
    my $url       = escapeAmpersands( $entry->{'url'} );

    my $row;
    if ( !$entry->{'error'} ) {
        my $image = "../images/$date->{'mon2'}$date->{'year2'}/"
          . "$comic-$date->{'day2'}.$entry->{'ext'}";
        my $alt = "$comic-$date->{'day2'}";
        $row = <<"EOF";
  <tr> <td align="left">
    $name     <a href="$url"> $url </a>
    <img src="$image" alt="$alt" />
  </td></tr>
EOF
    }
    else {
        my $error = escapeAmpersands( $entry->{'error'} );
        $row = <<"EOF";
  <tr> <td align="left">
    $name     <a href="$url"> $url </a>
    $comic : $error
  </td> </tr>
EOF
    }

    my $index;
    if ( !open( $index, '>>', $indexFile ) ) {
        warn "Can't append to $indexFile: $!\n";
        return 0;
    }
    print {$index} $row;
    close($index);

    return 0;
}

#######################################################################
## Function    : writeMainIndex
##
## Description :
##   Placeholder; currently does nothing (the call in the main program
##   is commented out).
#######################################################################
sub writeMainIndex {
    my ($date) = @_;
    return;
}

#######################################################################
## Function    : writeFooter
##
## Description :
##   Appends the closing markup, generation date and version information
##   to the day's HTML page.
#######################################################################
sub writeFooter {
    my ($date) = @_;

    my $indexFile = indexFileName($date);
    my $sysDate   = `date`;

    my $index;
    if ( !open( $index, '>>', $indexFile ) ) {
        warn "Can't append to $indexFile: $!\n";
        return;
    }
    print {$index} <<"EOF";
</table>
<center>
Generated on: $sysDate
Version: $ver
Config Version: $comicConfigVer
CVS: <a href="http://demandred.dyndns.org:3000/cgi-bin/cvsweb/comics/">http://demandred.dyndns.org/cgi-bin/cvsweb/comics/</a>
<a href="http://validator.w3.org/check?uri=referer"><img src="http://www.w3.org/Icons/valid-xhtml10-blue" alt="Valid XHTML 1.0 Transitional" height="31" width="88" border="0" /></a>
</center>
</body>
</html>
EOF
    close($index);

    return;
}

#######################################################################
## Function    : checkDir
##
## Description :
##   Creates every directory given that does not exist yet.  Accepts a
##   plain list of paths and/or array references of paths.
#######################################################################
sub checkDir {
    my @dirs = map { ref $_ eq 'ARRAY' ? @{$_} : $_ } @_;

    foreach my $dir (@dirs) {
        next if ( -d $dir );
        mkpath($dir);
    }

    return;
}

#######################################################################
## Function    : writeTitle
##
## Description :
##   Creates (truncating any previous copy) the day's HTML page and
##   writes the document head and the start of the comics table.  Both
##   the title and the long date use the date being fetched, so a run
##   with --days shows the right day.
#######################################################################
sub writeTitle {
    my ($date) = @_;

    my $indexFile = indexFileName($date);
    my $today =
        $days[ $date->{'dow'} ] . " "
      . $date->{'mon'} . "/"
      . $date->{'day'} . "/"
      . $date->{'year'};
    my $today_long =
      Date_to_Text_Long( $date->{'year'}, $date->{'mon'}, $date->{'day'} );

    my $index;
    if ( !open( $index, '>', $indexFile ) ) {
        warn "Can't create $indexFile: $!\n";
        return;
    }
    print {$index} <<"EOF";
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<link href="/daily/comics/comics.css" type="text/css" rel="stylesheet" media="screen" />
<link rel="shortcut icon" href="./favicon.ico" />
<title>Daily Comics for $today</title>
</head>
<body bgcolor="#FFFFFF">
<table align="center" cellpadding="0" cellspacing="0" border="0">
<tr><td align="left"><img src="images/daily_comics_heading01.png" alt="Comic Page Heading" /></td></tr>
<tr><td align="left">$today_long</td></tr>
<tr><td> </td></tr>
EOF
    close($index);

    return;
}

#######################################################################
## Function    : directDownload
##
## Description :
##   Mode 2 download: the image URL is produced by parseComic() from the
##   'search' template, fetched with wget and piped through 'convert' so
##   that it is always stored as a JPEG in $imageDir.
##
##   Because the stored file is always <comic>-<day>.jpg, the comic's
##   'ext' is set to "jpg" so the HTML page and the resize step refer to
##   the file that was actually written.
##
## Returns     : 0 on success, otherwise an error message.
#######################################################################
sub directDownload {
    my ( $comics, $comic, $date ) = @_;

    ## Must run before 'ext' is overwritten: the template may use __ext__.
    my $file = parseComic( $comics, $comic, $date );

    ##
    ## Save the file to the appropriate directory
    ##
    my $target = "$imageDir/$comic-$date->{'day2'}.jpg";
    $comics->{$comic}{'ext'} = 'jpg';

    ## A real pipeline is needed here, so every interpolated value is
    ## quoted for the shell.
    my $cmd = join ' ',
      'wget', '--no-check-certificate', '-q', shellQuote($file),
      '--referer=' . shellQuote( $comics->{$comic}{'url'} ),
      '--user-agent=' . shellQuote($USER_AGENT),
      '-O', '-', '|',
      '/usr/bin/convert', '-', shellQuote("jpeg:$target");

    my $status = system($cmd);
    return 0 if ( $status == 0 );

    return "ERROR: Could not download comic $comics->{$comic}{'fullName'} (exit status $status)";
}

#######################################################################
## Function    : indexDownload
##
## Description :
##   Mode 1 download: fetches the comic's web page, scans it with the
##   configured 'search' regex (whose first capture group must be the
##   image URL; the last matching line wins) and downloads that image
##   into $imageDir.  When the image URL contains gif, jpg or png the
##   comic's 'ext' is updated to match.
##
## Returns     : 0 on success, otherwise an error message.
#######################################################################
sub indexDownload {
    my ( $comics, $comic, $date ) = @_;
    my ( @lines, $comicLine );

    my $pageURL    = $comics->{$comic}{'url'};
    my $comicIndex = "$indexDir/index.$comic";

    print("Getching Index $comicIndex.\n");
    print("comic url: $pageURL\n");
    print Dumper( $comics->{$comic} );

    ## List form: no shell, so '&' and friends in the URL are harmless.
    my @wget_cmd = (
        'wget', "--referer=$pageURL",
        '--no-check-certificate', "--user-agent=$USER_AGENT",
        $pageURL, '-O', $comicIndex,
    );
    print("Using wget command:\n@wget_cmd\n");
    my $status = system(@wget_cmd);
    print("Return status: $status\n");

    my $fh;
    if ( !open( $fh, '<', $comicIndex ) ) {
        return
            "ERROR: Can't open index file for "
          . $comics->{$comic}{'fullName'} . " ("
          . $pageURL . ")";
    }
    while ( my $line = <$fh> ) {
        $line =~ s/\R|\ \ +|\t//g if ( $comics->{$comic}{'remove_newlines'} );
        push @lines, $line;
    }
    close($fh);
    unlink($comicIndex);

    ## Scheme and host of the page, used to complete relative image URLs.
    my $mainURL = $pageURL;
    if ( $pageURL =~ m{\A(\w+://[^/]+)} ) {
        $mainURL = $1;
    }
    else {
        $mainURL =~ s/([a-z])\/.*/$1/i;
    }

    ##
    ## Find the comic strip URL based on the specified regex in the search
    ##
    foreach my $line (@lines) {
        if ( $line =~ m/$comics->{$comic}{'search'}/i && defined $1 ) {
            $comicLine = $1;
            chomp $comicLine;
        }
    }

    ##
    ## Save the file to the appropriate directory
    ##
    if ($comicLine) {
        if ( $comicLine =~ m/(gif|jpg|png)/i ) {
            $comics->{$comic}{'ext'} = $1;
        }

        my $comicURL;
        if ( $comicLine =~ m{\A//} ) {
            ## Protocol-relative URL: reuse the page's scheme.
            my ($scheme) = $pageURL =~ m{\A(\w+):};
            $comicURL = ( $scheme || 'http' ) . ':' . $comicLine;
        }
        elsif ( $comicLine =~ m/http/ ) {
            $comicURL = $comicLine;
        }
        else {
            $comicURL = $mainURL . $comicLine;
        }

        # Strip &amp;
        $comicURL =~ s/\&amp\;/&/g;

        my $target =
          "$imageDir/$comic-$date->{'day2'}.$comics->{$comic}{'ext'}";
        my $rc = system(
            'wget', '--no-check-certificate',
            "--user-agent=$USER_AGENT", "--referer=$pageURL",
            '-q', $comicURL, '-O', $target,
        );
        return 0 if ( $rc == 0 );

        return "ERROR: Could not download comic $comics->{$comic}{'fullName'} (wget exit status $rc)";
    }

    ## NOTE(review): nothing in this script creates index.html any more
    ## (wget always gets -O); this looks like leftover cleanup - confirm
    ## before removing.
    unlink "index.html";

    return "ERROR: Could not download comic $comics->{$comic}{'fullName'}";
}

#######################################################################
## Function    : parseComic
##
## Description :
##   Expands the date placeholders (__year__, __year2__, __mon__,
##   __mon2__, __day__, __day2__) and __ext__ in the comic's 'search'
##   template and returns the resulting string.
#######################################################################
sub parseComic {
    my ( $comics, $comic, $date ) = @_;

    my $string = $comics->{$comic}{'search'};

    foreach my $field (qw( year year2 mon mon2 day day2 )) {
        $string =~ s/__${field}__/$date->{$field}/g;
    }
    $string =~ s/__ext__/$comics->{$comic}{'ext'}/g;
    chomp $string;

    return $string;
}

#######################################################################
## Function    : fetchDates
##
## Description :
##   Builds the date hash for the day being fetched (today minus the
##   file-level $days_ago):
##     day, mon, year  - numeric day, month (1-12) and 4-digit year
##     day2, mon2      - zero padded to two digits
##     year2           - last two digits of the year
##     dow             - day of week, 0 = Sunday
##     wday            - weekday name
#######################################################################
sub fetchDates {
    my %dates = ();

    ( $dates{'day'}, $dates{'mon'}, $dates{'year'}, $dates{'dow'} ) =
      ( localtime( time - ( 86400 * $days_ago ) ) )[ 3, 4, 5, 6 ];

    $dates{'year'} += 1900;
    $dates{'mon'}++;
    $dates{'year2'} = substr $dates{'year'}, 2, 2;
    $dates{'day2'}  = sprintf( "%02d", $dates{'day'} );
    $dates{'mon2'}  = sprintf( "%02d", $dates{'mon'} );

    ## Local copy: this runs before the file-level @days is assigned.
    my @dayNames = qw/ Sunday Monday Tuesday Wednesday Thursday Friday Saturday /;
    $dates{'wday'} = $dayNames[ $dates{'dow'} ];

    return %dates;
}

###############################################################################
##
## &fetchOptions( );
##
##      Grab our command line arguments and toss them in to a hash
##
###############################################################################
sub fetchOptions {
    my %opts;

    GetOptions(
        "days:i" => \$opts{'days'},
        "help|?" => \$opts{'help'},
        "man"    => \$opts{'man'},
    ) || pod2usage();

    pod2usage() if defined $opts{'help'};
    pod2usage( { -verbose => 2, -input => \*DATA } ) if defined $opts{'man'};

    return %opts;
}

__END__

=head1 NAME

fetch.pl - Fetches comics and places them all locally in a single html file.

=head1 SYNOPSIS

fetch.pl [options]

 Options:
   --days,d    Fetch comics from X days ago
   --help,?    Display the basic help menu
   --man,m     Display the detailed man page

=head1 DESCRIPTION

=head1 HISTORY

=head1 AUTHOR

Nicholas DeClario <nick@declario.com>

=head1 BUGS

This is a work in progress.  Please report all bugs to the author.

=head1 SEE ALSO

=head1 COPYRIGHT

=cut