#! /usr/local/bin/perl
#
# htgrep.pl --- perl package to query HTML files
#
# Author: Oscar Nierstrasz (oscar@cui.unige.ch) 11/5/94
#
# This package and friends can be found at:
# http://cui_www.unige.ch/ftp/PUBLIC/oscar/scripts/README.html
# or ftp: cui.unige.ch:/PUBLIC/oscar/scripts/
#
# See also: htgrep.html in the same location
#
# This package is called by `htgrep', `parscan' and others.
#
# Package to query a file of HTML paragraphs.  Paragraphs can be
# arbitrary, standalone blocks of HTML text, separated by blank lines.
# To query a file, use the URL:
#
#	http://<host>/cgi-bin/htgrep/file=<file>?query=<query>
#
# where http://<host>/<file> is the file's real URL.  If a header file
# <file>.hdr exists, htgrep will print that instead of the default
# header.  In addition, if <file>.qry exists, it will be used whenever a
# non-empty query is given.  (Normally <file>.hdr will be a cover page
# with introductory information, whereas <file>.qry will only contain the
# title and main headline.)
#
# The header files may also be specified with the tags hdr=<file> and
# qry=<file>.  The header files are assumed to be in the same directory
# as the file to search.  Note that all tags may appear either
# as part of the document URL or as the query string, i.e.:
#
#	http://<host>/cgi-bin/htgrep/file=<file>&query=<query>
#	http://<host>/cgi-bin/htgrep/file=<file>?query=<query>
#	http://<host>/cgi-bin/htgrep?file=<file>&query=<query>
#
# are all equivalent.
#
# If the file contents are not standalone HTML blocks but, for example,
# list items or pre-formatted text, htgrep can be instructed to bracket
# the results of the search with <PRE> and </PRE>, <DL> and </DL>,
# <OL> and </OL> or <UL> and </UL>.  Use the tag: style=pre, etc.
#
# The tag grab=yes will cause htgrep to search for URLs and ftp pointers
# and convert them into hypertext links.  This is probably most interesting
# in combination with the tag style=pre to query plain text files.
# An example is the Free Compilers List:
# http://cui_www.unige.ch/cgi-bin/htgrep/file=OSG/Langlist/free&style=pre&grab=yes
#
# The same package can be used to query a database of refer(1) style
# bibliography entries.  Use the tag refer=plain.
# See, for example, the OO Bibliography Database at CUI:
# http://cui_www.unige.ch/bibrefs
#
# The tag refer=abstract is used internally by htgrep and is automatically
# generated when a bibliography entry contains an abstract (%X field).
# A link to a new call to htgrep is then generated, which will cause
# the abstract for a given entry to be displayed.
# Links to ftpable papers are also generated, if the refer entry
# contains a line of the form:
#
#	%% ftp: <host>:<path>
#
# If the tag ftpstyle=dir is used, the link will be to the containing
# directory rather than to the file itself (to facilitate exploration).
#
# Normally a maximum of 250 records will be retrieved.  This can
# be controlled with the tag max=<number>.
#
# In some cases, this package is not called by htgrep but by another
# script that is responsible for setting the tags.  You can inform
# the package to use a different URL when generating new requests
# by using the tag htgrep=<url>, for example htgrep=/w3catalog.
#
# Finally, the tag linemode=yes causes htgrep to retrieve refer records on a
# line-by-line basis, if fields are separated by ^A instead of a "\n".
# (This is mainly interesting for the CUI library database.)
#
# TAGS
#
#	file	 -- file to search
#	isindex	 -- query string
#	hdr	 -- header file (to precede output)
#	qry	 -- query file (alternative header for non-empty query)
#	style	 -- [none/pre/ol/ul/dl] format of records
#	max	 -- max records to return (default 250)
#	grab	 -- [no/yes] convert URLs to hypertext (in plain text)
#	refer	 -- [plain/abstract] format
#	ftpstyle -- [file/dir] make link to ftp file or dir (for refer)
#	linemode -- [no/yes] each record is a single line
#	htgrep	 -- alternative URL to use for self-calls
#
# TO DO:
#	- should use POST method???

require "html.pl";
require "bib.pl";

# ---------------------------------------------------------------------------
package htgrep;

$v = 'htgrep v1.0';
# 11/5/93 -- converted `parscan' into cgi-bin script

# NOTE(review): this copy of the file had all HTML markup stripped from its
# string literals.  Tags that are unambiguous from the surrounding code
# (closing tags, the query FORM, the $perlexp link, the error heredoc) have
# been restored; purely presentational tags (<P>, <HR>, ...) are a best
# guess and should be compared against a pristine copy if one is available.

# ---------------------------------------------------------------------------
# Configure these:
$www_home = "/user/u2/ncsa_httpd/htdocs";
$htgrep = "/cgi-bin/htgrep";
# Single quotes: inside double quotes Perl 5 would interpolate @cui.
$maintainer = 'webmaster@cui.unige.ch';
# ---------------------------------------------------------------------------

$perlexp = "http://www.cis.ohio-state.edu:85/info/perl.info,Regular%20Expressions";

# Opening anchor for the signature line.
# NOTE(review): the original value was lost; the target below is the
# package home page named in the header comment of this file.
$ftp = '<A HREF="http://cui_www.unige.ch/ftp/PUBLIC/oscar/scripts/README.html">';
$sig = "This file was generated by $ftp$v</A>.\n<P>\n";

$maxcount = 250;	# MAX records to retrieve

# Alternative names for $tags{'style'}:
%styletags = (
	"HTML paragraphs", "NONE",
	"Numbered list", "OL",
	"Bullet list", "UL",
	"Description list", "DL",
	"Plain ascii text", "PRE"
);

# ---------------------------------------------------------------------------
# Main entry point: print the CGI header, validate %tags and either run the
# query or show the query form.
# %tags must be initialized before this is called:
sub doit {
	# Everything printed below is HTML, so announce it as such.
	print "Content-type: text/html\n\n";
	chdir($www_home) || &error("chdir", "Can't chdir to $www_home");
	&checktags(%tags);
	# Compare against "" so that a query of "0" is still a query.
	if ($query ne "") { &query(); }
	else { &noquery(); }
}

# private: HTML-escape a string for use in text or attribute values
sub htmlquote {
	local($text) = @_;
	$text =~ s/&/&amp;/g;
	$text =~ s/</&lt;/g;
	$text =~ s/>/&gt;/g;
	$text =~ s/"/&quot;/g;
	$text;
}

# private: undo URL encoding ("+" for space, %XX hex escapes)
sub unescape {
	local($text) = @_;
	$text =~ s/\+/ /g;
	$text =~ s/%([\da-fA-F]{2})/pack("C", hex($1))/eg;
	$text;
}

# generic routine to set tags from $ENV{'PATH_INFO'} or $ENV{'QUERY_STRING'}
# optionally called by the main script to initialize %tags
#
# Argument: one string, either "tag=val&tag=val..." or a bare query.
# Side effect: stores the decoded values in %tags.
sub settags {
	local($_) = @_;
	local(@terms, $term, $tag, $val);
	s|^/||;
	# Nothing to parse: do not clobber an isindex tag set by an earlier call.
	return if ($_ eq "");
	if (/=/) {
		@terms = split('&');
		foreach $term (@terms) {
			($tag,$val) = split('=',$term,2);
			# may override previous value
			$tags{$tag} = &unescape($val);
		}
	}
	# No '=', so the whole string must be a query
	# (it is URL-encoded just like tag values, so decode it too):
	else {
		$tags{'isindex'} = &unescape($_);
	}
}

# check %tags and initialize some variables
#
# Sets $file, $query, $maxcount, $pre, $post, $mode (and $isabstract).
# Calls &error (which exits) on a missing or unacceptable file name.
sub checktags {
	$file = $tags{'file'};
	($file eq "") && &error('bad request', "Missing filename.");
	(($file =~ m#\.+/#) || ($file =~ m#/\.+$#)) &&
		&error('bad request',
			"No backward directory references permitted: $file");
	# relative to www_home only!  Strip every leading slash, otherwise
	# "//etc/passwd" would survive as an absolute path.
	$file =~ s|^/+||;
	(-e $file) || &error('not found', "Can't find $file\n");
	(-d $file) && ($file = "$file/index.html");
	(-f $file) || &error('not found', "Can't find $file\n");
	#
	$query = $tags{'isindex'};
	#
	# max=0 would disable the "last if ++$count == $maxcount" limit,
	# so only accept positive values.
	($tags{'max'} =~ /^\d+$/) && ($tags{'max'} > 0)
		&& ($maxcount = $tags{'max'});
	#
	# $pre and $post are printed before and after the query results
	# They can be given directly, or retrieved from %styletags.
	# Should be pre, dl, ol or ul.
	$fmttag = $tags{'style'};
	$pretag = $styletags{$fmttag};
	if ($pretag =~ /./) {
		$pre = "<$pretag>"; $post = "</$pretag>";
	}
	# The tag name ends up inside markup, so only accept documented styles.
	elsif ($fmttag =~ /^(pre|dl|ol|ul|none)$/i) {
		$pre = "<$fmttag>"; $post = "</$fmttag>";
	}
	else { $pre = $post = ""; }
	if ($pre =~ /none/i) { $pre = $post = ""; }
	#
	# set the routine that will do the searching:
	if ($tags{'refer'} =~ /plain/i) { $mode = "&refmode"; }
	elsif ($tags{'refer'} =~ /abstract/i) {
		$mode = "&absmode";
		$isabstract = 1;
	}
	else { $mode = "&parmode"; }
}

# do the query
#
# Opens $file, prints the header, runs the search routine named in $mode
# and reports garbled patterns, overflow and empty results.
sub query {
	local($/);	# restored on return; set for real further down
	&safeopen(FILE,$file) ||
		&error("couldn't open file", "Can't open \"$file\": $!");
	&put_qry();
	# print the header
	print "Result of search for \"", &htmlquote($query), "\":<P>\n";
	#
	# normally records are separated by blank lines
	# if linemode is set, there is one record per line
	if ($tags{'linemode'} =~ /yes/i) { $/ = "\n"; }
	else { $/ = ""; }
	#
	# do the query and pick up the results.  $mode is set by checktags
	# (never taken from the request); \$query keeps the pattern itself
	# out of the evaluated string.
	eval "$mode(\$query)";
	if ($@ ne "") {
		print "$post\nGarbled search pattern: ", &htmlquote($@), "\n<P>\n",
			"Be sure to use a valid <A HREF=\"$perlexp\">",
			"Perl regular expression</A>.<P>\n";
	}
	$@ = undef;	# clean up error msg
	if ($count == $maxcount) {
		print "Too many matching records (&gt; $maxcount)!\n",
			"Try a more restrictive pattern.<P>\n";
	}
	elsif ($count == 0) {
		print "No matching entries found.<P>\n";
	}
	print "<HR>\n";
	print "$sig\n";
	print "</BODY>\n";
	close(FILE);
}

# Print the header for a non-empty query: the first existing file among
# the qry tag, <base>.qry, <base>.query, the hdr tag and <base>.hdr;
# otherwise a generated form.
sub put_qry {
	local($/);	# undef: gobble the whole header file
	# look for a header file:
	($base = $file) =~ s/\.html$//;
	# $dir is either empty or ends in "/", so append names directly;
	# "$dir/name" would point at the filesystem root when $dir is empty.
	($dir = $file) =~ s|[^/]*$||;
	(-f ($hdr = "$dir$tags{'qry'}")) ||
	(-f ($hdr = "$tags{'qry'}")) ||
	(-f ($hdr = "$base.qry")) ||
	(-f ($hdr = "$base.query")) ||
	(-f ($hdr = "$dir$tags{'hdr'}")) ||
	(-f ($hdr = $tags{'hdr'})) ||
	(-f ($hdr = "$base.hdr"));
	if (&safeopen(HDR,$hdr)) {
		print <HDR>;
		close(HDR);
	}
	else { &makeform(); }
}

# No query given: print the cover page (hdr tag or <base>.hdr) or a
# generated form, followed by the signature.
sub noquery {
	local($/);	# undef: gobble the whole header file
	# look for a header file:
	($base = $file) =~ s/\.html$//;
	($dir = $file) =~ s|[^/]*$||;
	(-f ($hdr = "$dir$tags{'hdr'}")) ||
	(-f ($hdr = $tags{'hdr'})) ||
	(-f ($hdr = "$base.hdr"));
	if (&safeopen(HDR,$hdr)) {
		print <HDR>;
		close(HDR);
	}
	else { &makeform(); }
	print "$sig\n";
	print "</BODY>\n";
}

#
# construct a new call to htgrep
# Prints a default page header and a form that re-invokes htgrep with
# the current tags (or the URL given by the htgrep tag).
sub makeform {
	local($title) = &htmlquote($file);
	unless (($url = $tags{'htgrep'}) =~ /./) {
		$url = "$htgrep/file=$file"
			. "&style=$tags{'style'}"
			. "&refer=$tags{'refer'}"
			. "&ftpstyle=$tags{'ftpstyle'}"
			. "&grab=$tags{'grab'}"
			. "&linemode=$tags{'linemode'}"
			. "&max=$maxcount";
		$url =~ s/ /+/g;
	}
	# SHOULD ALSO PACK SPECIAL CHARS?
	# The tag values come from the request, so at least keep them from
	# breaking out of the ACTION attribute.
	$url = &htmlquote($url);
	# no header file, so construct one
	print "<HEAD>\n<TITLE>Htgrep of $title</TITLE>\n";
	print "</HEAD>\n<BODY>\n<H1>Htgrep of $title</H1>\n";
	print "<P>\n";
	print "Provide a <A HREF=\"$perlexp\">",
		"Perl regular expression</A> ",
		"as a search pattern.\n";
	# NOTE(review): the FORM/INPUT attributes were lost with the markup;
	# a GET form whose field is named after the isindex tag matches what
	# settags parses.
	print "<FORM ACTION=\"$url\">\n",
		"<INPUT NAME=\"isindex\">\n",
		"<\/FORM>\n";
	print "<P>\n";
}

# not yet used ...
# Prints a default page header for a scan, with a search hint unless a
# query or an abstract request is active.
sub makeindex {
	local($title) = &htmlquote($file);
	print "<HEAD>\n<TITLE>Scan of $title</TITLE>\n";
	print "</HEAD>\n<BODY>\n<H1>Scan of $title</H1>\n";
	unless ($query || $isabstract) {
		print "Provide a <A HREF=\"$perlexp\">",
			"Perl regular expression</A> ",
			"as a search pattern.<P>\n";
	}
}

# Ripped off from Plexus
# Print an error page and exit.
# Arguments: $kind (short category) and $msg (plain-text explanation).
sub error {
	local($kind, $msg) = @_;
	# Both usually quote request data (file names, patterns).
	$kind = &htmlquote($kind);
	$msg = &htmlquote($msg);
	print <<EOM;
<TITLE>Htgrep error: $kind</TITLE>
<H1>Htgrep error: $kind</H1>
$msg
<P>
EOM
	exit;
}

# Ripped off from Plexus
# This should be adapted to allow restrictions to be specified!
#
# Open $path for reading on the filehandle named $fh.
# Returns the result of open(); calls &error (which exits) for absolute
# paths and for paths containing dot-only components.
sub safeopen {
	local($fh, $path) = @_;
	# Paths are relative to $www_home (see doit): refuse to leave it,
	# including via a leading "../" or an absolute name.
	(($path =~ m#^/#) || ($path =~ m#^\.\.+(/|$)#) ||
	 ($path =~ m#/\.+/#) || ($path =~ m#/\.+$#)) &&
		&error('bad request',
			"No backward directory references permitted: $path");
	# Three-argument open: no mode or pipe characters are interpreted in
	# $path, so the old leading-space and trailing-NUL tricks are not
	# needed (current perls refuse names containing "\0").
	open($fh, '<', $path);
}

# Search routine for plain paragraphs: print each record of FILE that
# matches $query, bracketed by $pre and $post; stops after $maxcount.
sub parmode {
	local($query) = @_;
	print $pre;
	$count = 0;
	while (<FILE>) {
		/$query/oi || next;
		#
		# Escape special chars in pre-formatted text:
		#
		if ($pre =~ /<pre>/i) { &html'esc; }
		#
		# grab URLs and convert them to hyperlinks:
		#
		if ($tags{'grab'} =~ /yes/i) { &html'href; }
		print;
		last if (++$count == $maxcount);
	}
	print $post;
}

# NOTE(review): this copy of the file had all HTML markup stripped from its
# string literals.  The tags and links printed by refmode and absmode are
# reconstructed from the surrounding code and the header comment; compare
# against a pristine copy if one is available.

# Search routine for refer(1) bibliographies: print each matching record
# of FILE as a list item, with a link to its abstract and/or ftp copy.
sub refmode {
	local($query) = @_;
	local($what, $key, $base, $ftpfile);
	&bib'html_init;
	$count = 0;
	unless ($pre =~ /./) {
		# NOTE(review): list type reconstructed; could have been <UL>.
		$pre = "<OL>\n"; $post = "</OL>\n"; # default style
	}
	#
	# Base URL for "show abstract" self-calls (tag refer=abstract).
	# Only known-good tag values are passed on.
	$base = $tags{'htgrep'};
	unless ($base =~ /./) { $base = "$htgrep/file=$file"; }
	$base =~ s/ /+/g;
	$base =~ s/"/%22/g; $base =~ s/</%3C/g; $base =~ s/>/%3E/g;
	$base .= "?refer=abstract";
	if ($tags{'ftpstyle'} eq "dir") { $base .= "&amp;ftpstyle=dir"; }
	if ($tags{'linemode'} =~ /yes/i) { $base .= "&amp;linemode=yes"; }
	print $pre;
	open(STDERR,">/dev/null");	# ignore errors from bib'getref
	while (<FILE>) {
		/$query/oi || next;
		tr/\001/\n/;	# expand ^A to CR
		#
		# Remember the raw title line (refer %T field) so that a link
		# can find this record again; quote it as a literal pattern and
		# URL-encode it.
		# NOTE(review): the original way of identifying the record was
		# lost with the markup -- confirm against bib.pl.
		$key = (/^%T[ \t]+(.*\S)/m) ? $1 : "";
		$key =~ s/(\W)/\\$1/g;
		$key =~ s/([^A-Za-z0-9_.-])/sprintf("%%%02X", ord($1))/eg;
		&accent'html;
		&bib'getref;
		print "\n<LI> $bib'ref\n";
		if ($bib'abstract ne "") {
			$what = "abstract";
			if ($bib'ftp ne "") { $what .= "+ftp"; }
			if ($key ne "") {
				print "<A HREF=\"$base&amp;isindex=$key\">$what</A>\n";
			}
			else { print "$what\n"; }
		}
		elsif ($bib'ftp ne "") {
			# point to the directory rather than the file:
			if ($tags{'ftpstyle'} eq "dir") {
				$bib'url =~ s|[^/]*$||;
				$bib'ftp =~ s|([^/]*)$||;
				$ftpfile = $1;
			}
			print "<A HREF=\"$bib'url\">ftp: $bib'ftp</A>$ftpfile\n";
		}
		last if (++$count == $maxcount);
	}
	print $post;
}

# Search routine for refer=abstract: print title, reference, abstract and
# ftp link of each matching record of FILE.  Returns 1.
sub absmode {
	local($query) = @_;
	local($ftpfile);
	&bib'html_init;
	$count = 0;	# query() tests it afterwards
	open(STDERR,">/dev/null");	# ignore errors from bib'getref
	while (<FILE>) {
		/$query/oi || next;
		tr/\001/\n/;	# expand ^A to CR, as refmode does
		&accent'html;
		&bib'getref;
		# NOTE(review): the tag printed here was lost; <P> is a guess.
		print "<P>\n";
		print "<H1>$bib'title</H1>\n";
		print "$bib'ref<P>\n";
		if ($bib'abstract ne "") {
			print "<H2>Abstract</H2>\n$bib'abstract<P>\n";
		}
		if ($bib'ftp ne "") {
			# point to the directory rather than the file:
			if ($tags{'ftpstyle'} eq "dir") {
				$bib'url =~ s|[^/]*$||;
				$bib'ftp =~ s|([^/]*)$||;
				$ftpfile = $1;
			}
			print "ftp: <A HREF=\"$bib'url\">",
				"$bib'ftp</A>$ftpfile<P>\n";
			# print "ftp: $bib'ftp<P>\n";
		}
		last if (++$count == $maxcount);
	}
	1;
}

# This routine provides backward compatibility for "parscan".
# Requests to URL /parscan/path?query should be mapped to
# a cgi-bin script `parscan' which will call &htgrep'parscan($path,$query)
#
# A leading path component of the form "-flag-flag" selects tags:
# dl/ol/ul/pre (style), a (refer=abstract), l (linemode), url (grab).
sub parscan {
	local($file,$query) = @_;
	local($flags);	# do not inherit flags from an earlier call
	$file =~ s|^/||;
	($file =~ s|^(-[^/]*)/||) && ($flags = "$1-");
	$tags{'file'} = $file;
	($flags =~ /-dl-/) && ($tags{'style'} = "dl");
	($flags =~ /-ol-/) && ($tags{'style'} = "ol");
	($flags =~ /-ul-/) && ($tags{'style'} = "ul");
	($flags =~ /-pre-/) && ($tags{'style'} = "pre");
	($flags =~ /-a-/) && ($tags{'refer'} = "abstract");
	($flags =~ /-l-/) && ($tags{'linemode'} = "yes");
	($flags =~ /-url-/) && ($tags{'grab'} = "yes");
	$tags{'isindex'} = $query;
	&doit();
}

1;	# a file loaded with require must return a true value