#!/usr/bin/perl -w # # www.scan # # Scan the WWW space. # # This looks for a few types of web errors. The basic error # it looks for is links to files which don't exist. # # This isn't pretty or fast. # # Every user who has entries linked in can have a file named # www.scan.exceptions # in their web directory. This file can contain URLs which they # never want to see an error about again. Of course, this is # a manual override. Note that this file can contain system URLs # or URLs for other users. # # # Future directions: # ------------------ # # Detect the framer on wnylc URLs, and strip that portion off. # # Be smarter about caching all of the names within a file so # the files don't need to be re-read. Note that off site URLs # should be read (rather then just a HEAD), and the appropriate # tags cached. However, don't chase down URLs from these pages. # # It would be nice to use the LWP::Simple and friends, but the # normal return information is insufficient to give good messages. # In addition, we don't want to follow redirects. That said, we # can probably use this more than we are doing now. # use strict; ###use LWP::Simple; use Socket; use Getopt::Long; use vars qw(%Args %ConnectFailed $Error %Exception $FirstLine $FirstTime %Headers $HighWater $Lasturl $LocalLinks $NErrors $Nlinks %Processed $Readstat %Referenced $RemoteLinks $Report %SeenBefore $TempFile @Unchecked $Version); $Version = 'www.scan v0.17 regan@armoredpenguin.com'; %Args = ( # Boolean variables 'checkuser' => 0, # Check user pages 'debug' => 0, # Debug output 'dumpurls' => 0, # Dump URLs found on a single pg 'dumpexcept' => 0, # Dump Exceptions 'traceurls' => 0, # Trace how URLs are added # Numeric limits 'depth' => 5000, # Depth to search 'maxurl' => 200, # Max characters in a URL 'timeout' => 1200, # Timeout (in seconds) 'delay' => 0, # Time (in seconds) between probes # Site reference 'website' => 'localhost', # Site to check 'webroot' => '/', # Root of site to check 'site' => '', # Composite site/root 'refs' => '', # Starting references to add 'exceptions' => '', # Comma separated list of # exception URLs # Other string parameters 'email' => "", # Send reports out here 'sendmail' => '/usr/lib/sendmail', # Sendmail binary ); ### ### Main program ### my($key, $time, $url, @urls); $time = time(); $Report = ""; $NErrors = $Nlinks = $LocalLinks = $RemoteLinks = $HighWater = 0; GetOptions(\%Args, "debug!", "dumpurls!", "checkuser!", "dumpexcept!", "website=s", "webroot=s", "site=s", "refs=s", "exceptions=s", "depth=i", "email=s", "sendmail=s", "maxurl=i", "timeout=i", "traceurls!"); if ($Args{'site'} =~ m#http://([^/]+)(.*)#) { ($Args{'website'}, $Args{'webroot'}) = ($1, $2); } $Args{'site'} = "http://$Args{'website'}$Args{'webroot'}"; $Args{'checkuser'} = 1 if ($Args{'site'} =~ m#/~#); $Args{'site'} =~ s#/[^/]*\.html?$#/#; print qq|Site is "$Args{'website'}", root is "$Args{'webroot'}"\n|; # Process all exception sites @urls = split(/,/, $Args{'exceptions'}) if ($Args{'exceptions'} ne ""); for (@urls) { read_exceptions($_); } # Process all of the references to add @urls = split(/,/, $Args{'refs'}) if ($Args{'refs'} ne ""); for (@urls) { add_reference($_, "Root"); } ### ### Find unlinked files ### $| = 1; $FirstTime = 1; $TempFile = "/tmp/www.scan.$$"; # Temp file name $Error = 0; add_reference("http://$Args{'website'}$Args{'webroot'}", "Root"); # $url = "http://$Args{'website'}$Args{'webroot'}"; # push(@Unchecked, $url); while (@Unchecked) { $url = pop(@Unchecked); next if (!$Args{'checkuser'} && $url =~ /~/); # Do not check user pages read_exceptions("$1/www.scan.exceptions") if ($url =~ m#^([^~]*~[^/]+)#); next if ($Exception{$url}); if ($url =~ /\.\.\./) { print qq|Compare "http://www.peak.org/~regan/cgi/..." to "$url"\n|; print "http://www.peak.org/~regan/cgi/..." eq $url; print "\n"; print "We have a match to http://www.peak.org/~regan/cgi/...\n"; print "Exception is $Exception{'http://www.peak.org/~regan/cgi/...'}\n"; } check_file($url); if ($Args{'dumpurls'}) { print "URLs discovered in $url\n"; for $url (@Unchecked) { print "\t$url\n"; } print "See the file processed at $TempFile\n"; exit 0; } @Unchecked = sort(@Unchecked); sleep($Args{'delay'}) if ($Args{'delay'} > 0); } ### ### Send email report ### if ($Args{'email'} ne "") { if (!open(MAIL, "| $Args{'sendmail'} -t")) { print STDERR "Cannot open $Args{'sendmail'}: $!\n"; exit 1; } print MAIL "From: www.scan\n"; print MAIL "To: $Args{'email'}\n"; print MAIL "Subject: $Args{'site'} www.scan results\n"; print MAIL "\n"; print MAIL "The link checker program www.scan was started with " . "the following parameters:\n"; for $key (sort keys %Args) { printf MAIL "\t%-15.15s %s\n", $key, $Args{$key}; } print MAIL "\n\n"; if ($Args{'dumpexcept'}) { print MAIL "Exceptions:\n"; for $key (sort keys %Exception) { print MAIL "\t$key\n"; } print MAIL "\n\n"; } print MAIL "Information on how to interpret and use this output\n"; print MAIL "can be found at:\n"; print MAIL "\thttp://www.armoredpenguin.com/www.scan/interpret.html\n\n"; print MAIL "Items found to be checked:\n"; if ($Report eq "") { print MAIL "No broken links were found.\n"; } else { print MAIL $Report; } print MAIL "\n\n"; print MAIL "$HighWater entries in the search queue at the worst case.\n"; print MAIL "$Nlinks checked, $LocalLinks local, $RemoteLinks remote.\n"; print MAIL "$NErrors errors.\n"; $time = time() - $time; printf MAIL "Execution time: %d:%02d:%02d\n", int($time / 3600), int(($time % 3600) / 60), $time % 60; print MAIL "\n"; print MAIL "\t\t$Version\n"; print MAIL "\t\thttp://www.armoredpenguin.com/www.scan/\n"; } unlink($TempFile); exit ($Error); ### ### add_reference ### ### We have a URL. Add it in to be searched if it hasn't been dealt ### with before. ### ### It is possible for a single URL to be placed on the Unchecked list ### multiple times, but it will only be processed once. ### sub add_reference { my($url, $from) = @_; # $url =~ s#/$##; # Any URL with a "?" has problems in that we cannot trust how to # expand URLs (at least in a simple fashion). So trim these URLs. $url =~ s/\?.*// if ($from ne "Root"); # Some files are obviously not URLs. Punt on these. return if ($url =~ /\.(wc|lc|wh2)$/); # We are generating bogus addresses. At least catch them. # !!! # if ($url =~ m#/index/# || $url =~ m#/index$#) # { # $Report .= qq|Bad address "$url" from "$from"\n|; # print qq|Bad address "$url" from "$from"\n|; # return; # } # If we have 2 slashes in a row, something is probably broken if ($url =~ m#http://.*//#) { print STDERR "Bad url: $url from $from\n"; return; } # If we have already processed this file, don't do it again return if ($Processed{$url}); return if ($SeenBefore{$url}); $SeenBefore{$url} = 1; print "Adding reference of $url from $from\n" if ($Args{'traceurls'}); # If this URL is too long, get rid of it return if (length($url) > $Args{'maxurl'}); #print "Add reference to $url from $from\n"; # Save at least one place that this URL is referenced from # This could be a list of all references. However, that is harder # to do, and when the file is processed (and any error message given), # it may not have accumulated all references yet. $Referenced{$url} = $from; # Put this on the list to be processed while ($url =~ m#/\./#) { $url =~ s#/\./#/#g; } push(@Unchecked, $url) if (scalar(@Unchecked) < $Args{'depth'}); } ### ### canon_url ### ### Make a URL canonical ### sub canon_url { my($url, $from) = @_; my($base, $user, $fname, $new, $oldf); #print "Canon_url: $url from $from\n"; my($orig); $orig = $url; # At WNYLC they use a prefix URL for offsite references. We want # the real reference, so clean up the URL. if ($url =~ m#\Qhttp://www.wnylc.net/web/news/frame-click.asp?url=#) { print "Convert $url to "; $url =~ s#\Qhttp://www.wnylc.net/web/news/frame-click.asp?url=##; print "$url\n"; } # Remove query string from URL $url =~ s/\?.*//; # Remove ./ from URL while ($url =~ m#/\./#) { $url =~ s#/\./#/#g } $url =~ s/^\s*//; return "" if ($url =~ /^mailto:/ || $url =~ /^wais:/ || $url =~ /^file:/ || $url =~ /^gopher:/ || $url =~ /^news:/ || $url =~ /^news:/); return $url if ($url =~ /^http:/); return $url if ($url =~ /[a-z]+:/i); # hack # It is in a directory relative to the current file. $base = $from; $base =~ s#(^http://[^/]+)(.*)##i; ($base, $fname) = ($1, $2); # If there is a user name, deal with it $user = ""; $fname =~ s#^/##; if ($fname =~ m#(^~[^/]*)#) { $user = "/" . $1; $fname =~ s#^~[^/]*##; } # Remove the last segment of the fname # $fname =~ s#/$##; # Remove trailing slash $fname = "/$fname" if ($fname !~ m#^/#); # Add leading slash $fname =~ s#(/[^/]*$)##; # Remove slash followed by non-slash chars $oldf = $1; if ($url =~ /^\#/) { $url = $oldf . $url; $url =~ s#^/##; } # Make all pieces have the appropriate slashes on them $fname = "/" . $fname if ($fname !~ m#^/#); $fname =~ s#/$##; # If the URL is absolute, kill the fname and user if ($url =~ m#^/#) { $user = ""; $fname = ""; $url =~ s#^/##; } $new = $base . $user . $fname . "/" . $url; #print "Translate $url and $from to base=$base, user=$user, fname=$fname, url=$url ($new) "; while ($new =~ m#/\.\./#) { $new =~ s#[^/]*/\.\./##; } if ($new =~ m#/\.\.$#) { $new =~ s#[^/]*/\.\.$##; } #print "resulting in $new\n"; $Report .= qq|Canon: start with "$orig", end with "$new"\n| if ($new =~ m#/index$#); print qq|Canon: start with "$orig" from "$from", end with "$new"\n| if ($Args{'debug'}); return $new; } ### ### check_file ### ### See if the file exists. If it does, add entries to the unchecked ### list. If it doesn't exist, then display an error. ### ### The URL here should be canonical. ### sub check_file { my($url) = @_; my($msg, $site, $old_url, $readstat, $size); # If we have processed this file already, don't do it again return 1 if ($Processed{$url}); $Nlinks++; if ($url =~ /\Q$Args{'website'}/) { # print "$url is a local link\n"; $LocalLinks++; } else { # print "$url is a remote link\n"; $RemoteLinks++; } $Processed{$url} = 1; $old_url = $url; $url =~ s/X-img-src/http/; # print time() . " "; $size = scalar(@Unchecked); print qq|$size Checking file "$url"\n|; $HighWater = $size if ($size > $HighWater); $url =~ s/#.*//; # Remove internal references for the fetch return if ($url eq ""); return unless ($url =~ m#^http://#); $readstat = HTMLreadTimeout($url, $TempFile); #print "Read status is $readstat\n"; if ($readstat eq "OK") { # Do nothing. The file exists, but we didn't get the contents. } elsif ($readstat ne "BAD" && $readstat ne "" && good_file($TempFile)) { # Good # print "$url was found\n"; $site = $url; $site =~ s#^[^:]*\://##; $site =~ s#/.*##; if ($site eq $Args{'website'} && $old_url =~ /http:/ && ($old_url !~ /\.(h|wh|wc)$/)) { parse_html($old_url, $TempFile) if ($url =~ /^http:/i); } } elsif ($FirstLine !~ /Cannot connect/) { # Missing $FirstLine = "(new location at $Headers{'Location'})" if (defined($Headers{'Location'})); $msg = "\t Missing file: \"$url\" $FirstLine\n"; print "\treferenced from: $Referenced{$old_url}\n" if (defined($Referenced{$old_url})); print $msg; $Report .= $msg; $NErrors++; $Error = 1; } ### unlink($TempFile); } ### ### good_file ### ### If the file returned says that the file is bad, say that it is bad. ### sub good_file { my($fname) = @_; my($newurl); open(DATA, $fname) || die "Cannot open $fname"; $_ = ; close DATA; return 0 if (/404 Not Found/ || /302/); return 1; } ### ### parse_html ### ### Parse an HTML file. Put further references in for any references found. ### Make all references canonical. ### ### Find HREF and IMG SRC. ### sub parse_html { my($url, $fname) = @_; my($data, $msg, $newurl, $tag, $old_url); $old_url = $url; $tag = ""; if ($url =~ /#/) { $tag = $url; $tag =~ s/.*#//; $url =~ s/#.*//; } open(DATA, $fname) || die "Cannot open $fname"; $data = ""; while () { chomp; $data .= "$_ "; } close DATA; $tag =~ s/(\W)/\\$1/g; # Clean up metacharacters $data =~ s///g; # Remove comments # $data = decomment($data); # We really want to read decommented lines #print "Url $url gives: $data\n"; while ($data =~ /A[^>]+HREF\s*=\s*"([^"]*)"/i) { $newurl = $1; # print "Found a URL as $newurl in $url in $data\n"; # print "Found a URL as $newurl in $url\n"; $newurl = canon_url($newurl, $url); # print "translated to $newurl\n"; add_reference($newurl, $url) if ($newurl ne ""); $data =~ s/A[^>]+HREF\s*=\s*"([^"]*)"//i; } while ($data =~ /LINK[^>]+HREF\s*=\s*"([^"]*)"/i) { $newurl = $1; $newurl = canon_url($newurl, $url); add_reference($newurl, $url) if ($newurl ne ""); $data =~ s/LINK[^>]+HREF\s*=\s*"([^"]*)"//i; } while ($data =~ /IMG[^>]+SRC\s*=\s*"([^"]*)"/i) { $newurl = $1; # print "Found an image as $newurl\n"; $newurl = canon_url($newurl, $url); $newurl =~ s/http/X-img-src/; # print "translated to $newurl\n"; add_reference($newurl, $url) if ($newurl ne ""); $data =~ s/IMG[^>]+SRC\s*=\s*"([^"]*)"//i; if ($newurl =~ /construct.gif$/) { print "\tUnder construction: $url\n"; $Error = 1; } } $tag = "" if ($tag ne "" && $data =~ /A\s+NAME\s*=\s*"$tag"/i); if ($tag ne "") { $msg = "\t Missing name: $tag\n" . "\t From file: $url\n" . "\treferenced from: $Referenced{$old_url}\n"; print $msg; $Report .= $msg; $NErrors++; $Error = 1; } } ### ### read_exceptions ### ### Read a per user exception list for each user. ### This attempts to read a file for any user name on any machine. ### Luckily, most people don't have accounts on this machine. ### sub read_exceptions { my($url) = @_; my($content, @content); $content = xget($url); return if (!defined($content)); @content = split(/\n/, $content); $Exception{"FILE: $url"} = 1; for (@content) { chomp; s/\s+$//; next if (/^\s*$/ || /^#/); print qq|Exception "$_" added\n| if ($Args{'debug'}); $Exception{$_} = 1; s/http/X-img-src/; $Exception{$_} = 1; } } ### ### HTMLreadTimeout ### ### Read an HTML link. Timeout if not in an appropriate time window. ### sub HTMLreadTimeout { my($url, $tempfile) = @_; my($ret, $wakeup); $wakeup = 0; eval { $SIG{'ALRM'} = sub { $wakeup = 1; die "Cannot fetch $url" }; alarm($Args{'timeout'}); $ret = HTMLread2($url, $tempfile); }; alarm(0); if ($wakeup) { $FirstLine = "Timeout"; $ret = "BAD"; } return $ret; } ### ### HTMLread2 ### ### Read an HTML file. ### ### For awhile I was trying to guess about malformed URLs (e.g. no ### trailing slash). It was painful. It is probably better to look ### for the Location: header line and use that if given. However, even ### better is to use the Location information as part of the feedback ### to the user of this program. ### sub HTMLread2 { my($url, $tempfile) = @_; my($ret, $last, $url2); $FirstLine = ""; $ret = HTMLread($url, $tempfile); return "BAD" if ($ret eq "NEVER GETTING BETTER"); # return $ret if ($ret ne "" && $ret ne "BAD"); # $url .= "/" if ($url !~ m#/$#); # $ret = HTMLread($url, $tempfile); # return "BAD" if ($ret eq "NEVER GETTING BETTER"); # return $ret if ($ret ne "" && $ret ne "BAD"); return $ret; } ### ### HTMLread ### ### Read an HTML file. ### ### This only handles http protocol. ### ### Many servers have problems with the HEAD method. Always do ### GETs, and just return the head for offsite references. ### sub HTMLread { my($url, $fname) = @_; my($addrtype, $server, $pat, $af_inet, $ip, $length, $stream, $tcp, $proto, $aliases, $name, $http, $personal, $port, @bytes, $title, $intitle, $cmd, @addrs, $this, $that, $url2); # If this is the same base URL as last time, give the same # result as last time. The file contents should still be good. # This has an effect because the list of Unchecked URLs is sorted. # If we add stuff before our friend, then we loose. If afterwards # or no particular change, we win. This is an attempt to cache # files for URL fragments. The $fname is always the same for # a particular run of this program. Hmm. Because the first fragment # will enroll all of the new URLs, subsequent ones won't add anything # new. We should win alot. ($url2 = $url) =~ s/\#.*//; if (defined($Lasturl) && $url2 eq $Lasturl) { print "Use the last file fetched: $Lasturl\n"; return $Readstat; } $Lasturl = $url2; # Get the machine to talk to %Headers = (); open(NEWFILE, ">$fname") || die "Cannot open $fname"; print "\nFetch $url\n" if ($Args{'debug'}); $url2 = $url; $server = $url; $server =~ s#http://##i; $server =~ s#/.*##; $url2 =~ s#http://[^/]*##i; $url2 = "/" if ($url2 eq ""); # $cmd = "HEAD"; # if ($Args{'website'} eq $server) # { # if ($Args{'webroot'} =~ m#^(~[^/]+)#) # { # # If the root is a personal home page, don't expand out too far. # $personal = $1; # $personal =~ s/(\W)/\\$1/g; # $cmd = "GET" if ($url2 =~ /$personal/); # } # else # { # $cmd = "GET"; # } # } $cmd = "GET"; $cmd = "HEAD" unless ($url =~ /\Q$Args{'site'}/); print "Cmd is $cmd for $url\n" if ($Args{'debug'}); # Get the port number to use $port = 80; if ($server =~ /:/) { $port = $server; $port =~ s/.*://; $server =~ s/:.*//; } print "Server is $server, URL is $url2, port is $port\n" if ($Args{'debug'}); # Parameters $pat = 'S n C4 x8'; $af_inet = 2; $stream = 1; # Get magic numbers ($name, $aliases, $proto) = getprotobyname('tcp'); $tcp = $proto; # If the server is by number, use the number, otherwise look up the name if ($server =~ /^[\d\.]+$/) { @bytes = split(/\./, $server); } else { ($name, $aliases, $addrtype, $length, @addrs) = gethostbyname($server); $FirstLine = "Cannot resolve address $server"; return "BAD" unless $name; # Cannot find the address @bytes = unpack("C4", $addrs[0]); } $ip = join(".", @bytes); if (defined($ConnectFailed{$ip})) { # If we tried to connect once and failed, don't bother again. # It may be network problems, it may be a failed host. Let's # move on. close NEWFILE; $Report .= "\t Previous connect failed: $ip: $!\n"; $Report .= "\t Referencing URL: $url\n"; return "BAD"; } $this = pack($pat, $af_inet, 1492, 0, 0, 0, 0); $that = pack($pat, $af_inet, $port, @bytes); socket(HTTP, $af_inet, $stream, $tcp) || die "socket: $!\n"; setsockopt(HTTP, SOL_SOCKET, SO_REUSEADDR, pack("l", 1)) || die "setsockopt: $!"; if ($FirstTime) { bind(HTTP, $this) || die "bind: $!\n"; $FirstTime = 0; } if (!connect(HTTP, $that)) { $ip = join(".", @bytes); $ConnectFailed{$ip} = 1; print "Cannot connect to $ip: $!\n"; # if ($Args{'debug'}); close(HTTP); $FirstLine = "Cannot connect"; $Report .= "\tCannot connect to $ip: $!\n"; $Report .= "\tReferencing URL: $url\n"; return "BAD"; } select(HTTP); $| = 1; select(STDOUT); $| = 1; print "Connected to HTTP server at $server (",join('.', @bytes),").\n" if ($Args{'debug'}); $url2 =~ s#^/+#/#; $url2 =~ s/ /%20/g; print HTTP "GET $url2 HTTP/1.0\r\nHost: $server\r\n\r\n"; print "Command is: GET $url2 HTTP/1.0\nHost: $server\n" if ($Args{'debug'}); $intitle = 0; $title = ""; while () { last if (/\S/); } $Readstat = $fname; $_ = "cmd fails 666: $cmd $url2 HTTP/1.0\n" if (!defined($_)); print "First line is $_" if ($Args{'debug'}); $FirstLine = $_; $FirstLine =~ s/\r//g; chomp($FirstLine); $FirstLine = "(Address $bytes[0].$bytes[1].$bytes[2].$bytes[3] $FirstLine)"; if (/Moved/) { $Readstat = "BAD"; } elsif (/666/) { # The read has failed. This shows up on MS servers w/ HEAD command. # However, if you do the GET it'll probably work. Bleh. $Readstat = "OK"; # $Readstat = "NEVER GETTING BETTER"; } elsif (/302/) { $Readstat = "OK"; } elsif (/200/) { while () { s/\r\n$/\n/m; s/\r/\n/mg; last if (/^$/ && $cmd eq "HEAD"); # Simulate HEAD print NEWFILE "$_\n"; } } else { # Failure of some sort $Readstat = ""; } while () { s/[\r\n]//g; last if (/^$/); $Headers{$1} = $2 if (/^(.*?):\s*(.*)/); } close(HTTP); close NEWFILE; print "HTMLread returns \"$Readstat\"\n" if ($Args{'debug'}); return $Readstat; } ### ### decomment ### ### This is taken from tchrist@perl.com. ### sub decomment { s{ # up to a `>' }{ if ($1 || $3) { # this silliness for embedded comments in tags ""; } }gesx; # mutate into nada, nothing, and niente } ###### ###### Simple.pm ###### ###### The Simple.pm routines have a problem that opening a socket ###### destroys the timer. This is bad, as it means we can not timeout ###### the transaction. Bleh. So the code through the end of the ###### file is a horrid modification to Simple.pm with a timeout ###### parameter passed around. In addition, the documentation and ###### unused (and thus untested) routines are stripped. ###### use vars qw($VERSION $FULL_LWP $ua %loop_check); use HTTP::Status; ### ### xget ### ### All calls to get are routed through here to attempt to ### avoid timeout conditions. ### sub xget { my($url) = @_; my($contents, $timer); # print STDERR scalar(localtime()), " Go fetch $url\n"; $contents = get2($url, 10 * 60); # print STDERR scalar(localtime()), " Turn off alarm ", # ((defined($contents)) ? "good data" : "timeout"), "\n"; return $contents; } sub _init_ua { require LWP; require LWP::UserAgent; require HTTP::Status; require HTTP::Date; $ua = new LWP::UserAgent; # we create a global UserAgent object my $ver = $LWP::VERSION = $LWP::VERSION; # avoid warning $ua->agent("LWP::Simple/$LWP::VERSION"); $ua->env_proxy; } sub get2 { my($url, $timeout) = @_; %loop_check = (); return _get($url, $timeout); } sub _get { my($url, $timeout) = @_; my $ret; $VERSION = sprintf("%d.%02d", q$Revision: 1.8 $ =~ /(\d+)\.(\d+)/); $FULL_LWP++ if grep {lc($_) eq "http_proxy"} keys %ENV; if (!$FULL_LWP && $url =~ m,^http://([^/:]+)(?::(\d+))?(/\S*)?$,) { my $host = $1; my $port = $2 || 80; my $path = $3; $path = "/" unless defined($path); return _trivial_http_get($host, $port, $path, $timeout); } else { _init_ua() unless $ua; my $request = HTTP::Request->new(GET => $url); my $response = $ua->request($request); return $response->is_success ? $response->content : undef; } } sub _trivial_http_get { my($host, $port, $path, $timeout) = @_; #print "HOST=$host, PORT=$port, PATH=$path\n"; require IO::Socket; local($^W) = 0; my $sock = IO::Socket::INET->new(PeerAddr => $host, PeerPort => $port, Proto => 'tcp', Timeout => 60) || return; $sock->autoflush; my $netloc = $host; $netloc .= ":$port" if $port != 80; print $sock join("\015\012" => "GET $path HTTP/1.0", "Host: $netloc", "User-Agent: lwp-trivial/$VERSION", "", ""); my $buf = ""; my $n; my $timer; my $wakeup = 0; eval { $SIG{'ALRM'} = sub { $wakeup = 1; die "Cannot fetch $host:$port/$path" }; print STDERR "Set timer for read at $timeout\n"; alarm($timeout); 1 while $n = sysread($sock, $buf, 8*1024, length($buf)); }; $timer = alarm(0); print STDERR "Timer finished with $timer seconds\n"; # 1 while $n = sysread($sock, $buf, 8*1024, length($buf)); if ($wakeup) { print STDERR "Alarm clock fired\n"; return undef; } return undef unless defined($n); if ($buf =~ m,^HTTP/\d+\.\d+\s+(\d+)[^\012]*\012,) { my $code = $1; #print "CODE=$code\n$buf\n"; if ($code =~ /^30[1237]/ && $buf =~ /\012Location:\s*(\S+)/) { # redirect my $url = $1; return undef if $loop_check{$url}++; return _get($url, $timeout); } return undef unless $code =~ /^2/; $buf =~ s/.+?\015?\012\015?\012//s; # zap header } return $buf; }