#!/usr/bin/perl -w
#
# spider.web
#
# Walk a web site looking for image files and build a csv file.
#
# Usage: spider.web file.csv url...
#
# file.csv is read first (if it exists) so that sizes recorded by an
# earlier run are carried over; the new listing is written to
# file.csv.new, the previous file is kept as file.csv.old, and the new
# listing then replaces file.csv.  Each output line has the form
#     image-url,size,0,referring-page-url
#
# Some of the examples generated below may contain pictures
# inappropriate to some viewers.  Take a look at the archive
# under question before using the example blindly:
#     spider.web escher.csv http://ftp.sunet.se/pub/pictures/art/M.C.Escher/
#     spider.web art.csv http://ftp.sunet.se/pub/pictures/art/
# Note that the trailing slash is quite important in these examples,
# as it is a shortcut to the index.html or equivalent.
#
use strict;
use LWP::Simple;
use vars qw($Count %Csv $Debug $New @URLStack %VisitedImg %VisitedHtml);

$Debug = 0;

###
### Main program
###
$Count = $New = 0;

my $csv = shift(@ARGV);
# Refuse to run without at least one URL: otherwise an empty listing
# would be written and renamed over the existing csv file.
if (!defined($csv) || !@ARGV) {
    print "Usage: spider.web file.csv url...\n";
    exit 1;
}

# Load the previous listing, if any, as url => size.  A missing file is
# not an error: it simply means every image found will count as new.
if (open(my $old, '<', $csv)) {
    while (my $line = <$old>) {
        chomp $line;
        my @fields = split(/,/, $line);
        # Skip blank or truncated lines instead of storing undef values.
        next unless defined($fields[0]) && defined($fields[1]);
        $Csv{$fields[0]} = $fields[1];
    }
    close $old;
}

# The bareword CSV handle is deliberate: Spider prints its entries to it.
open(CSV, '>', "$csv.new") or die "Cannot open $csv.new: $!";
for my $url (@ARGV) {
    # Every start URL is crawled with fresh bookkeeping.
    %VisitedImg  = ();
    %VisitedHtml = ();
    @URLStack    = ();
    Spider($url);
}
# Buffered write errors only surface at close, so check it.
close(CSV) or die "Cannot close $csv.new: $!";

# Keep the previous listing as a backup (there is none on a first run),
# then move the new listing into place.
if (-e $csv) {
    rename($csv, "$csv.old") or die "Cannot rename $csv to $csv.old: $!";
}
rename("$csv.new", $csv) or die "Cannot rename $csv.new to $csv: $!";

print "$Count entries, $New new entries\n";

###
### Spider
###
### Look across a site from a base URL.  Don't enroll any web pages
### which are outside of the root.
###
### This only parses a simplistic flavor of references.
### The HTML::Parse is the right way of doing this.
###
### Parameters:
###   $baseurl - URL at which the crawl starts.  Everything below the
###              directory part of this URL is considered "inside the
###              root"; links leading elsewhere are ignored.
###
### Uses the package globals @URLStack (work list), %VisitedHtml and
### %VisitedImg (seen pages / images), %Csv (sizes from the previous
### listing), $Count and $New (counters), $Debug, and the already open
### CSV file handle, to which one line per new image is printed:
###     image-url,size,0,referring-page-url
###
sub Spider {
    my ($baseurl) = @_;

    # The root is derived from the canonical form of the base URL so that
    # it compares equal, as a prefix, with the canonicalised links below.
    my $root = _parent_url(_canonical_url($baseurl));

    # A link is in scope when it lies below the root directory.  Matching
    # on "$root/" rather than on the bare prefix keeps a root of /pub/art
    # from also accepting /pub/artwork.  The literal address is a legacy
    # site-specific exception that has always been accepted as well.
    my $in_scope = sub {
        my ($candidate) = @_;
        return index($candidate, "$root/") == 0
            || $candidate =~ /192\.41\.13\.240/;
    };

    push(@URLStack, $baseurl);
    # Do not fetch the start page a second time if some page links back.
    $VisitedHtml{$baseurl} = 1;
    $VisitedHtml{ _canonical_url($baseurl) } = 1;

    while (@URLStack) {
        my $url = pop @URLStack;
        print "*** Process $url\n" if ($Debug);

        # $site is scheme://host, used for links that start with "/";
        # $home is the directory of the current page, used for relative links.
        (my $site = $url) =~ s#^([a-z]*://[^/]*).*#$1#;
        my $home = _parent_url($url);

        my $data = get($url);
        if (!defined($data) || length($data) == 0) {
            print STDERR "Cannot fetch $url\n";
            next;
        }
        # Tags may be broken across lines; flatten before matching.
        $data =~ s/[\r\n]/ /g;

        # Anchors: queue html pages / directories, record linked images.
        for my $href (_attr_values($data, 'a', 'href')) {
            print "*** href is $href\n" if ($Debug);
            my $item = _resolve_link($href, $site, $home);
            next unless defined($item);
            print "*** canon $item\n" if ($Debug);
            next unless $in_scope->($item);
            print "*** Accept $item\n" if ($Debug);

            if ($item =~ /\.html?$/i || $item =~ m#/$#) {
                if (!$VisitedHtml{$item}) {
                    print "*** push work $item\n" if ($Debug);
                    push(@URLStack, $item);
                    $VisitedHtml{$item} = 1;
                }
            }
            elsif (_is_image($item)) {
                print "*** push image $item\n" if ($Debug);
                _record_image($item, $url);
            }
        }

        # Inline images.  The scope test runs on the canonical URL, exactly
        # as for anchors, so "../" sequences cannot lead outside the root.
        for my $src (_attr_values($data, 'img', 'src')) {
            print "*** img is $src\n" if ($Debug);
            my $item = _resolve_link($src, $site, $home);
            next unless defined($item);
            print "*** canon $item\n" if ($Debug);
            next unless $in_scope->($item);
            print "*** Accept $item\n" if ($Debug);

            _record_image($item, $url) if (_is_image($item));
        }
    }
    return;
}

# Return the values of attribute $attr found in <$tag ...> elements of
# $html, in document order.  Handles double-quoted, single-quoted and
# unquoted values; both names are matched case-insensitively.
sub _attr_values {
    my ($html, $tag, $attr) = @_;
    my @values;
    while ($html =~ m{<\Q$tag\E\s+(?:[^>]*?\s)?\Q$attr\E\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>"']*))}gi) {
        my $value = defined($1) ? $1
                  : defined($2) ? $2
                  :               $3;
        $value =~ s/^\s+//;
        $value =~ s/\s+$//;
        push(@values, $value);
    }
    return @values;
}

# Return the "directory" of $url: the URL with its last path segment
# removed.  A bare scheme://host is returned unchanged, because stripping
# it would eat the host name.
sub _parent_url {
    my ($url) = @_;
    return $url if ($url =~ m{^[a-z][a-z0-9+.-]*://[^/]*$}i);
    $url =~ s{/[^/]*$}{};
    return $url;
}

# Turn the link text $link found on a page into an absolute, canonical
# URL.  $site is the page's scheme://host, $home the page's directory.
# Returns undef for links with a scheme other than http/https.
sub _resolve_link {
    my ($link, $site, $home) = @_;
    if ($link =~ m{^([a-z0-9+.-]*):}i) {
        my $scheme = lc($1);
        return undef unless ($scheme eq 'http' || $scheme eq 'https');
    }
    elsif ($link =~ m{^/}) {
        $link = "$site$link";
    }
    else {
        $link = "$home/$link";
    }
    return _canonical_url($link);
}

# Collapse repeated slashes and resolve "/segment/../" sequences.  Only
# the path is reduced; the scheme://host part is left alone so that a
# surplus ".." cannot swallow the host name.
sub _canonical_url {
    my ($url) = @_;
    $url =~ s{/+}{/}g;
    $url =~ s{:/}{://};    # restore the "//" that follows the scheme

    my ($authority, $path) = ('', $url);
    if ($url =~ m{^([a-z][a-z0-9+.-]*://[^/]*)(.*)$}i) {
        ($authority, $path) = ($1, $2);
    }
    while ($path =~ m{/\.\./}) {
        # Drop one real segment together with the ".." that follows it.
        next if ($path =~ s{/(?!\.\./)[^/]+/\.\./}{/});
        # A leading "/../" cannot climb above the top; just drop it.  If
        # that does not apply either, stop rather than loop forever.
        last unless ($path =~ s{^/\.\./}{/});
    }
    return $authority . $path;
}

# True for gif/jpg/jpeg URLs, except the ".m." / ".t." variants
# (e.g. foo.t.gif), which are not listed.
sub _is_image {
    my ($url) = @_;
    return $url =~ /\.(gif|jpg|jpeg)$/i && $url !~ /\.(m|t)\.(gif|jpg)$/;
}

# Write one csv line for image $image found on page $page, unless the
# image has already been recorded during this crawl.  The size is carried
# over from the previous listing (0 when unknown).
sub _record_image {
    my ($image, $page) = @_;
    return if ($VisitedImg{$image});

    my $size = $Csv{$image} || 0;
    $New++ if (!defined($Csv{$image}));
    $Count++;
    print CSV "$image,$size,0,$page\n"
        or die "Cannot write csv entry for $image: $!";
    $VisitedImg{$image} = 1;
    return;
}