#!/usr/local/bin/perl -s # # htget --- mirror WWW pages and directories from an HTTP server # # WARNING: apparently this does not work under Solaris (yet) # # Oscar Nierstrasz 24/3/94 oscar@cui.unige.ch # # This script and friends can be found at: # http://cui_www.unige.ch/ftp/PUBLIC/oscar/scripts/README.html # or ftp: cui.unige.ch:/PUBLIC/oscar/scripts/ # # Retrieves HTML pages, creating local copies in the current directory. # With the -abs option, relative URLs are converted to absolute URLs. # # With the -r option, all reachable URLs with the same directory prefix # are retrieved. All relative URLs are converted to absolute URLs, # *except* those with the same prefix, which are all converted # to relative URLs (overrides -abs). This is intended for mirroring # WWW directories: point at some "root" page or directory, and htget # will recursively retrieve all reachable pages. # # Warning: only http, ftp and gopher URLs are currently understood. # This is an evolution of `hget', but still only understands HTTP. # See urlget (separate script) for correct handling of gopher protocol. # # To install this, you will also need url.pl and ftplib.pl # This script and dependent packages can be retrieved by ftp: # cui.unige.ch:/PUBLIC/oscar/scripts/* # Or see: http://cui_www.unige.ch/ftp/PUBLIC/oscar/scripts/README.html # # See also Martijn Koster's htmirror: # http://web.nexor.co.uk/mirror/mirror.html # # 25/3/94 -- added -s option to send output to stdout # 28/3/94 -- made HTTP 1.0 the default # Change this to wherever you want to install url.pl and friends unshift(@INC,"/usr/local/lib/inet-perl"); require "url.pl"; require "html.pl"; die "Usage: htget [-abs] [-r(ecursive)] [-s(tdout)] ...\n" unless $#ARGV >= 0; $version = "HTTP 1.0"; # default if ($old) { undef($version); } # OLD servers select(STDERR); $| = 1; select(STDOUT); foreach $url (@ARGV) { ($type,$host,$port,$path,$request) = &html'parse(undef,undef,undef,undef,$url); $prefix = "http://$host:$port$path"; $url = "http://$host:$port$request"; push(@toget,$url); # list of URLs to retrieve &htget; } # get all URLs in @toget sub htget { while ($#toget >= $[) { $url = shift @toget; if ($retrieved{$url}) { # print STDERR "htget: already retrieved $url\n"; next; } $retrieved{$url} = 1; # print STDERR "htget: GOT $url\n"; if ($page = &url'get($url,$version)) { if ($r) { # make all relative URLs absolute $page = &html'abs($url,$page); # Make sure ports are present in absolute URLs with this prefix: $page =~ s|http://$host$path|$prefix|g; # extract all URLs with the required prefix foreach $ref (grep(/^$prefix/,&html'hrefs($page))) { push(@toget,$ref) unless $retrieved{$ref}; } # make all URLs with common prefix relative # (so that the mirrored pages refer to each other, # and not to the original pages!) ($thisprefix = $url) =~ s|[^/]*$||; $page =~ s|$thisprefix||g; } elsif ($abs) { $page = &html'abs($url,$page); } if ($page) { &makepage($url,$page); } else { print STDERR "htget: couldn't retrieve $url -- $!\n"; } } else { warn "$!\n"; } } } # mirror a retrieved page sub makepage { ($url,$page) = @_; # STDOUT, so don't create page: if ($s) { print $page; return; } # extract the file name ($file = $url) =~ s|^$prefix||; # if empty, assume index.html if ($file =~ /^$/) { $file = "index.html"; } # If a directory, assume index.html $file =~ s|/$|$&index.html|; # Make all intermediate directories &mkdir($file); # If a directory already exists with this path, # assume index.html (this will occur if the source pages # do not consistently put a trailing "/" on directories if (-d $file) { $file .= "/index.html"; } if (open(PAGE,">$file")) { print PAGE $page; close PAGE; print STDERR "htget: created $file (", length($page), " bytes)\n"; } else { print STDERR "htget: can't write $url to $file -- $!\n"; } } # Make all intermediate directories needed for a file sub mkdir { local($file) = @_; local(@dirs) = split("/",$file); pop @dirs; local($path) = "."; foreach $dir (@dirs) { $path .= "/$dir"; # only make if it isn't already there unless (-d $path) { unless (mkdir($path,0755)) { if (-f $path) { # Hm, a file already exists with that name. # Someone must have forgotten a trailing blank # on a directory, so let's move the file down # one level to index.html $tmp = "$path$$"; link($path,$tmp); unlink($path); mkdir($path,0755); link($tmp,"$path/index.html"); unlink($tmp); print STDERR "htget: moved $path to $path/index.html\n"; } else { print STDERR "htget: can't create $path -- $!\n";} } } } } __END__