#!/usr/local/bin/perl # # dtsort.pl # A. de la Fuente 13-Jan-1995 # Translated from dtsort.awk # eval "exec /usr/local/bin/perl -S $0 $*" if $running_under_some_shell; # this emulates #! processing on NIH machines. # (remove #! line above if indigestible) eval '$'.$1.'$2;' while $ARGV[0] =~ /^([A-Za-z_]+=)(.*)/ && shift; # process any FOO=bar switche $countrycodefile = "/vendor/webhead/WWW/docs/WWWVL/Astronomy/astroweb/country-codes"; #!/usr/local/bin/gawk -f # # Use: dtsort [-v tail=filename1.html] # [-v keys=filename2.html] # [-v sort={protoname|topzone|name}] # sorted_file.html # # Program dtsort sorts
-style resource records into "JANET" or # "protocol"order. It also applies certain transformation rules, such # as changing all SGML/HTML codes to uppercase and suppressing # trailing blanks. In addition, it deletes exact duplicate resource # records, and concatenates similar records by using
..
to # indent. # # Optional file 'tail' is nserted into the 'sorted_file' just before # the final .
lines can be appended with this # technique. Optional file 'keys' will contain a listing of the sort # keys (the JANET-style values). # # D.Wells, NRAO-CV, 11Feb-10Mar94,Jly94. # todo: # *change news:/janet logic so news will sort properly. # -------------------------------------------------------------------- $[ = 1; # set array base to 1 $, = ' '; # set output field separator $\ = "\n"; # set output record separator $tail_file = $tail; if ($sort eq '') { $sort = 'topzone'; # ; } &to_lower(''); # Copy input header lines up to first blank, then append comment: while (($_ = &Getline0(),$getline_ok) > 0) { if (length > 0) { print $_; } else { last; } } $command = "echo \"" . sprintf('%12s', "'" . $sort . "'") . ' sort (`whoami`' . " on `hostname`. at `date`)\""; $string = &Getline3($command, '|'); delete $opened{$command} && close($command); printf "\n", $string; # print comment to stdout &Pick('|', 'cat 1>&2') && (print $fh $string); # also to stderr (shows on terminal) printf (("\n
\n")); # if ($kf = ($Keys ne '')) { $key_file = $Keys; &Pick('>', $key_file) && (print $fh ''); &Pick('>>', $key_file) && (print $fh 'Sort Keys for Master List'); &Pick('>>', $key_file) && (printf $fh "\n", $string); &Pick('>>', $key_file) && (print $fh ''); &Pick('>>', $key_file) && (printf $fh "Sort-keys are intended to be in '%s' order:

\n", $sort); &Pick('>>', $key_file) && (print $fh "
\n"); } # -------------------------------------------------------------------- # Canonical resource records begin with blank line followed by
. # Therefore, we skip all input lines until we get a left-justified
, # then we concatenate the resource record lines, up to the next blank line: line: while (<>) { chop; # strip record separator @Fld = split(' ', $_, 9999); if ($_ =~ /^[<][Dd][Tt][>]/) { $r = $_ . '||'; while (($_ = &Getline0(),$getline_ok)) { if (length > 0) { $r = $r . $_ . '||'; } else { last; # The lines of this resource are now concatenated into one string, # separated by the code '||': ; } } $n++; $resource{$n} = $r; } # -------------------------------------------------------------------- # When reach EOF, edit each resource into canonical form, use keys[] to # sort the resource records, then delete duplicates and merge similar # resources, and finally print the resources to stdout: # ------------------------------------------------------------------ # Replace 'pattern' in resource[l] with 'string' and set 'flag': # ------------------------------------------------------------------ } &Pick('|', 'cat 1>&2') && (printf $fh "%4d resource records read.\n", $n); # Edit resource records into canonical form: $num_edit = 0; for ($l = 1; $l <= $n; $l++) { #??? $flag = 0; &replace_pattern(" +\\|\\|", '||'); &replace_pattern("\\|\\| +", '||'); # temporary rule? &replace_pattern("\"file://", "\"ftp://"); &replace_pattern(' -->', ' -->'); &replace_pattern('\\|\\|" && if ($resource{$a{$l}} =~ // && ($RLENGTH = length($&), $RSTART = length($`)+1)) { # Test for special "???" (somebody) case: if ((substr($resource{$a{$l}}, 1, $RSTART - 1) . substr($resource{$a{$l}}, $RSTART + $RLENGTH, 999999)) eq $resource{$a{$l + 1}}) { #??? $resource{$a{$l + 1}} = $resource{$a{$l}}; $num_merge++; $num_somebody++; $somebody_flag = 1; } } if (!$somebody_flag) { # append resource[a[l+1]] to [l],
makes indentation: if ($resource{$a{$l}} =~ "
\\|\\|\$" && ($RLENGTH = length($&), $RSTART = length($`)+1)) { $resource{$a{$l + 1}} = substr($resource{$a{$l}}, 1, $RSTART - 1) . $resource{$a{$l + 1}} . '
||'; $num_multiple++; } else { $resource{$a{$l + 1}} = $resource{$a{$l}} . '
The following resources are similar ' . '(same sort-key, different text):||' . $resource{$a{$l + 1}} . '
||'; } $num_merge++; } } else { $a{++$i} = $a{$l}; } } $a{++$i} = $a{$n}; $n = $i; if ($num_merge > 0) { &Pick('|', 'cat 1>&2') && (printf $fh "%4d similar resource records merged. (n=%d) %d multiple, %dsomebody\n", $num_merge, $n, $num_multiple, $num_somebody); # # following table gives comment strings for selected top-level domains: ; } #read in table of ISO codes and country names open (blah, "<$countrycodefile") || die "Can't open $countrycodefile"; while () { chop; local($iso,$name)=split(' '); $iso =~ y/A-Z/a-z/; $tld{"$iso"}="$name"; } close blah; # $tld{'1_http'} = 'World Wide Web'; $tld{'2_gopher'} = 'Gopher'; $tld{'3_wais'} = 'Wide Area Information Services'; $tld{'4_rlogin'} = $tld{'4_telnet'} = 'Remote Login'; $tld{'5_file'} = 'Anonymous-FTP [File Transfer Protocol]'; $tld{'6_news'} = 'USEnet_Newsgroups'; $tld{'7_mailto'} = 'Email exploders'; # # print the sorted resource records: $last_top = ''; for ($l = 1; $l <= $n; $l++) { #??? $this_key = $Keys{$a{$l}}; $top = substr($this_key, 1, index($this_key, '.') - 1); if ($top ne $last_top) { #??? if ($topcom = (defined $tld{$top})) { $topcom = $tld{$top}; } else { $topcom = ''; } $text = '2'; $text = "
\n
' . $top . ' (' . $topcom . ')
\n
\n\n"; print $text; if ($kf) { &Pick('>>', $key_file) && (print $fh $text); } $last_top = $top; } $this_resource = $resource{$a{$l}}; while (length($this_resource) > 0) { $p = index($this_resource, '||'); # find end of current line printf "%s\n", substr($this_resource, 1, $p - 1); # print it $this_resource = substr($this_resource, $p + 2, 999999); # and delete it. ; } printf (("\n")); # if ($kf) { &Pick('>>', $key_file) && (printf $fh "
\n\n%s\n
\n\n", $urls{$a{$l}}, $Keys{$a{$l}}); } } print '
'; if ($tail_file ne '') { $num_tail = 0; while (($_ = &Getline2($tail_file),$getline_ok) > 0) { print $_; $num_tail++; } delete $opened{$tail_file} && close($tail_file); # printf ("%4d lines copied from %s.\n", num_tail, tail_file) | "cat 1>&2"; ; } print ''; print ''; &Pick('|', 'cat 1>&2') && (printf $fh "%4d resource records written.\n", $n); # if ($kf) { &Pick('>>', $key_file) && (print $fh ''); delete $opened{$key_file} && close($key_file); } exit $ExitValue; sub replace_pattern { local($pattern, $string) = @_; $num_hits = 0; while ($resource{$l} =~ $pattern && ($RLENGTH = length($&), $RSTART = length($`)+1)) { $matched = substr($resource{$l}, $RSTART, $RLENGTH); $hit{$matched}++; $num_hits++; if ($num_hits > 30) { &Pick('|', 'cat 1>&2') && (printf $fh "TOO MANY HITS? l=%3d,RS=%3d,RL=%2d,pat=|>%s<| matches |>%s<|\n", $l, $RSTART, $RLENGTH, $pattern, $matched); # exit(13); ; } $hit{substr($resource{$l}, $RSTART, $RLENGTH)}++; $resource{$l} = substr($resource{$l}, 1, $RSTART - 1) . $string . substr($resource{$l}, $RSTART + $RLENGTH, 999999); $flag = 1; } } sub to_lower { local($string) = @_; if ($string eq '') { #??? $lc{'A'} = 'a'; $lc{'B'} = 'b'; $lc{'C'} = 'c'; $lc{'D'} = 'd'; $lc{'E'} = 'e'; $lc{'F'} = 'f'; $lc{'G'} = 'g'; $lc{'H'} = 'h'; $lc{'I'} = 'i'; $lc{'J'} = 'j'; $lc{'K'} = 'k'; $lc{'L'} = 'l'; $lc{'M'} = 'm'; $lc{'N'} = 'n'; $lc{'O'} = 'o'; $lc{'P'} = 'p'; $lc{'Q'} = 'q'; $lc{'R'} = 'r'; $lc{'S'} = 's'; $lc{'T'} = 't'; $lc{'U'} = 'u'; $lc{'V'} = 'v'; $lc{'W'} = 'w'; $lc{'X'} = 'x'; $lc{'Y'} = 'y'; $lc{'Z'} = 'z'; } else { $lower_string = ''; for ($lci = 1; $lci <= length($string); $lci++) { $cc = substr($string, $lci, 1); if (defined $lc{$cc}) { $cc = $lc{$cc}; } $lower_string = $lower_string . $cc; } } ($lower_string); } sub Getline0 { if ($getline_ok = (($_ = <>) ne '')) { chop; # strip record separator @Fld = split(' ', $_, 9999); } $_; } sub Getline2 { &Pick('',@_); if ($getline_ok = (($_ = <$fh>) ne '')) { chop; # strip record separator @Fld = split(' ', $_, 9999); } $_; } sub Getline3 { &Pick('',@_); local($_); if ($getline_ok = (($_ = <$fh>) ne '')) { chop; # strip record separator } $_; } sub Pick { local($mode,$name,$pipe) = @_; $fh = $name; open($name,$mode.$name.$pipe) unless $opened{$name}++; }