summaryrefslogtreecommitdiff
path: root/docs/docbook/scripts/collateindex.pl
diff options
context:
space:
mode:
Diffstat (limited to 'docs/docbook/scripts/collateindex.pl')
-rw-r--r--docs/docbook/scripts/collateindex.pl595
1 files changed, 595 insertions, 0 deletions
diff --git a/docs/docbook/scripts/collateindex.pl b/docs/docbook/scripts/collateindex.pl
new file mode 100644
index 0000000000..fd757edb32
--- /dev/null
+++ b/docs/docbook/scripts/collateindex.pl
@@ -0,0 +1,595 @@
+# -*- Perl -*-
+#
+
+use Getopt::Std;
+
+$usage = "Usage: $0 <opts> file
+Where <opts> are:
+ -p Link to points in the document. The default is to link
+ to the closest containing section.
+ -g Group terms with IndexDiv based on the first letter
+ of the term (or its sortas attribute).
+ (This probably doesn't handle i10n particularly well)
+ -s name Name the IndexDiv that contains symbols. The default
+ is 'Symbols'. Meaningless if -g is not used.
+ -t name Title for the index.
+ -P file Read a preamble from file. The content of file will
+ be inserted before the <index> tag.
+ -i id The ID for the <index> tag.
+ -o file Output to file. Defaults to stdout.
+ -S scope Scope of the index, must be 'all', 'local', or 'global'.
+ If unspecified, 'all' is assumed.
+ -I scope The implied scope, must be 'all', 'local', or 'global'.
+ IndexTerms which do not specify a scope will have the
+ implied scope. If unspecified, 'all' is assumed.
+ -x Make a SetIndex.
+ -f Force the output file to be written, even if it appears
+ to have been edited by hand.
+ -N New index (generates an empty index file).
+ file The file containing index data generated by Jade
+ with the DocBook HTML Stylesheet.\n";
+
+die $usage if ! getopts('Dfgi:NpP:s:o:S:I:t:x');
+
+$linkpoints = $opt_p;
+$lettergroups = $opt_g;
+$symbolsname = $opt_s || "Symbols";
+$title = $opt_t;
+$preamble = $opt_P;
+$outfile = $opt_o || '-';
+$indexid = $opt_i;
+$scope = uc($opt_S) || 'ALL';
+$impliedscope = uc($opt_I) || 'ALL';
+$setindex = $opt_x;
+$forceoutput = $opt_f;
+$newindex = $opt_N;
+$debug = $opt_D;
+
+$indextag = $setindex ? 'setindex' : 'index';
+
+if ($newindex) {
+ safe_open(*OUT, $outfile);
+ if ($indexid) {
+ print OUT "<$indextag id='$indexid'>\n\n";
+ } else {
+ print OUT "<$indextag>\n\n";
+ }
+
+ print OUT "<!-- This file was produced by collateindex.pl. -->\n";
+ print OUT "<!-- Remove this comment if you edit this file by hand! -->\n";
+
+ print OUT "</$indextag>\n";
+ exit 0;
+}
+
+$dat = shift @ARGV || die $usage;
+die "$0: cannot find $dat.\n" if ! -f $dat;
+
+%legal_scopes = ('ALL' => 1, 'LOCAL' => 1, 'GLOBAL' => 1);
+if ($scope && !$legal_scopes{$scope}) {
+ die "Invalid scope.\n$usage\n";
+}
+if ($impliedscope && !$legal_scopes{$impliedscope}) {
+ die "Invalid implied scope.\n$usage\n";
+}
+
+@term = ();
+%id = ();
+
+$termcount = 0;
+
+print STDERR "Processing $dat...\n";
+
+# Read the index file, creating an array of objects. Each object
+# represents and indexterm and has fields for the content of the
+# indexterm
+
+open (F, $dat);
+while (<F>) {
+ chop;
+
+ if (/^\/indexterm/i) {
+ push (@term, $idx);
+ next;
+ }
+
+ if (/^indexterm (.*)$/i) {
+ $termcount++;
+ $idx = {};
+ $idx->{'zone'} = {};
+ $idx->{'href'} = $1;
+ $idx->{'count'} = $termcount;
+ $idx->{'scope'} = $impliedscope;
+ next;
+ }
+
+ if (/^indexpoint (.*)$/i) {
+ $idx->{'hrefpoint'} = $1;
+ next;
+ }
+
+ if (/^title (.*)$/i) {
+ $idx->{'title'} = $1;
+ next;
+ }
+
+ if (/^primary[\[ ](.*)$/i) {
+ if (/^primary\[(.*?)\] (.*)$/i) {
+ $idx->{'psortas'} = $1;
+ $idx->{'primary'} = $2;
+ } else {
+ $idx->{'psortas'} = $1;
+ $idx->{'primary'} = $1;
+ }
+ next;
+ }
+
+ if (/^secondary[\[ ](.*)$/i) {
+ if (/^secondary\[(.*?)\] (.*)$/i) {
+ $idx->{'ssortas'} = $1;
+ $idx->{'secondary'} = $2;
+ } else {
+ $idx->{'ssortas'} = $1;
+ $idx->{'secondary'} = $1;
+ }
+ next;
+ }
+
+ if (/^tertiary[\[ ](.*)$/i) {
+ if (/^tertiary\[(.*?)\] (.*)$/i) {
+ $idx->{'tsortas'} = $1;
+ $idx->{'tertiary'} = $2;
+ } else {
+ $idx->{'tsortas'} = $1;
+ $idx->{'tertiary'} = $1;
+ }
+ next;
+ }
+
+ if (/^see (.*)$/i) {
+ $idx->{'see'} = $1;
+ next;
+ }
+
+ if (/^seealso (.*)$/i) {
+ $idx->{'seealso'} = $1;
+ next;
+ }
+
+ if (/^significance (.*)$/i) {
+ $idx->{'significance'} = $1;
+ next;
+ }
+
+ if (/^class (.*)$/i) {
+ $idx->{'class'} = $1;
+ next;
+ }
+
+ if (/^scope (.*)$/i) {
+ $idx->{'scope'} = uc($1);
+ next;
+ }
+
+ if (/^startref (.*)$/i) {
+ $idx->{'startref'} = $1;
+ next;
+ }
+
+ if (/^id (.*)$/i) {
+ $idx->{'id'} = $1;
+ $id{$1} = $idx;
+ next;
+ }
+
+ if (/^zone (.*)$/i) {
+ my($href) = $1;
+ $_ = scalar(<F>);
+ chop;
+ die "Bad zone: $_\n" if !/^title (.*)$/i;
+ $idx->{'zone'}->{$href} = $1;
+ next;
+ }
+
+ die "Unrecognized: $_\n";
+}
+close (F);
+
+print STDERR "$termcount entries loaded...\n";
+
+# Fixup the startrefs...
+# In DocBook, STARTREF is a #CONREF attribute; support this by copying
+# all of the fields from the indexterm with the id specified by STARTREF
+# to the indexterm that has the STARTREF.
+foreach $idx (@term) {
+ my($ididx, $field);
+ if ($idx->{'startref'}) {
+ $ididx = $id{$idx->{'startref'}};
+ foreach $field ('primary', 'secondary', 'tertiary', 'see', 'seealso',
+ 'psortas', 'ssortas', 'tsortas', 'significance',
+ 'class', 'scope') {
+ $idx->{$field} = $ididx->{$field};
+ }
+ }
+}
+
+# Sort the index terms
+@term = sort termsort @term;
+
+# Move all of the non-alphabetic entries to the front of the index.
+@term = sortsymbols(@term);
+
+safe_open(*OUT, $outfile);
+
+# Write the index...
+if ($indexid) {
+ print OUT "<$indextag id='$indexid'>\n\n";
+} else {
+ print OUT "<$indextag>\n\n";
+}
+
+print OUT "<!-- This file was produced by collateindex.pl. -->\n";
+print OUT "<!-- Remove this comment if you edit this file by hand! -->\n";
+
+print OUT "<!-- ULINK is abused here.
+
+ The URL attribute holds the URL that points from the index entry
+ back to the appropriate place in the output produced by the HTML
+ stylesheet. (It's much easier to calculate this URL in the first
+ pass.)
+
+ The Role attribute holds the ID (either real or manufactured) of
+ the corresponding INDEXTERM. This is used by the print backends
+ to produce page numbers.
+
+ The entries below are sorted and collated into the correct order.
+ Duplicates may be removed in the HTML backend, but in the print
+ backends, it is impossible to suppress duplicate pages or coalesce
+ sequences of pages into a range.
+-->\n\n";
+
+print OUT "<title>$title</title>\n\n" if $title;
+
+$last = {}; # the last indexterm we processed
+$first = 1; # this is the first one
+$group = ""; # we're not in a group yet
+$lastout = ""; # we've not put anything out yet
+
+foreach $idx (@term) {
+ next if $idx->{'startref'}; # no way to represent spans...
+ next if ($idx->{'scope'} eq 'LOCAL') && ($scope eq 'GLOBAL');
+ next if ($idx->{'scope'} eq 'GLOBAL') && ($scope eq 'LOCAL');
+ next if &same($idx, $last); # suppress duplicates
+
+ $termcount--;
+
+ # If primary changes, output a whole new index term, otherwise just
+ # output another secondary or tertiary, as appropriate. We know from
+ # sorting that the terms will always be in the right order.
+ if (!&tsame($last, $idx, 'primary')) {
+ print "DIFF PRIM\n" if $debug;
+ &end_entry() if not $first;
+
+ if ($lettergroups) {
+ # If we're grouping, make the right indexdivs
+ $letter = $idx->{'psortas'};
+ $letter = $idx->{'primary'} if !$letter;
+ $letter = uc(substr($letter, 0, 1));
+
+ # symbols are a special case
+ if (($letter lt 'A') || ($letter gt 'Z')) {
+ if (($group eq '')
+ || (($group ge 'A') && ($group le 'Z'))) {
+ print OUT "</indexdiv>\n" if !$first;
+ print OUT "<indexdiv><title>$symbolsname</title>\n\n";
+ $group = $letter;
+ }
+ } elsif (($group eq '') || ($group ne $letter)) {
+ print OUT "</indexdiv>\n" if !$first;
+ print OUT "<indexdiv><title>$letter</title>\n\n";
+ $group = $letter;
+ }
+ }
+
+ $first = 0; # there can only be on first ;-)
+
+ print OUT "<indexentry>\n";
+ print OUT " <primaryie>", $idx->{'primary'};
+ $lastout = "primaryie";
+
+ if ($idx->{'secondary'}) {
+ print OUT "\n </primaryie>\n";
+ print OUT " <secondaryie>", $idx->{'secondary'};
+ $lastout = "secondaryie";
+ };
+
+ if ($idx->{'tertiary'}) {
+ print OUT "\n </secondaryie>\n";
+ print OUT " <tertiaryie>", $idx->{'tertiary'};
+ $lastout = "tertiaryie";
+ }
+ } elsif (!&tsame($last, $idx, 'secondary')) {
+ print "DIFF SEC\n" if $debug;
+
+ print OUT "\n </$lastout>\n" if $lastout;
+
+ print OUT " <secondaryie>", $idx->{'secondary'};
+ $lastout = "secondaryie";
+ if ($idx->{'tertiary'}) {
+ print OUT "\n </secondaryie>\n";
+ print OUT " <tertiaryie>", $idx->{'tertiary'};
+ $lastout = "tertiaryie";
+ }
+ } elsif (!&tsame($last, $idx, 'tertiary')) {
+ print "DIFF TERT\n" if $debug;
+
+ print OUT "\n </$lastout>\n" if $lastout;
+
+ if ($idx->{'tertiary'}) {
+ print OUT " <tertiaryie>", $idx->{'tertiary'};
+ $lastout = "tertiaryie";
+ }
+ }
+
+ &print_term($idx);
+
+ $last = $idx;
+}
+
+# Termcount is > 0 iff some entries were skipped.
+print STDERR "$termcount entries ignored...\n";
+
+&end_entry();
+
+print OUT "</indexdiv>\n" if $lettergroups;
+print OUT "</$indextag>\n";
+
+close (OUT);
+
+print STDERR "Done.\n";
+
+sub same {
+ my($a) = shift;
+ my($b) = shift;
+
+ my($aP) = $a->{'psortas'} || $a->{'primary'};
+ my($aS) = $a->{'ssortas'} || $a->{'secondary'};
+ my($aT) = $a->{'tsortas'} || $a->{'tertiary'};
+
+ my($bP) = $b->{'psortas'} || $b->{'primary'};
+ my($bS) = $b->{'ssortas'} || $b->{'secondary'};
+ my($bT) = $b->{'tsortas'} || $b->{'tertiary'};
+
+ my($same);
+
+ $aP =~ s/^\s*//; $aP =~ s/\s*$//; $aP = uc($aP);
+ $aS =~ s/^\s*//; $aS =~ s/\s*$//; $aS = uc($aS);
+ $aT =~ s/^\s*//; $aT =~ s/\s*$//; $aT = uc($aT);
+ $bP =~ s/^\s*//; $bP =~ s/\s*$//; $bP = uc($bP);
+ $bS =~ s/^\s*//; $bS =~ s/\s*$//; $bS = uc($bS);
+ $bT =~ s/^\s*//; $bT =~ s/\s*$//; $bT = uc($bT);
+
+# print "[$aP]=[$bP]\n";
+# print "[$aS]=[$bS]\n";
+# print "[$aT]=[$bT]\n";
+
+ # Two index terms are the same if:
+ # 1. the primary, secondary, and tertiary entries are the same
+ # (or have the same SORTAS)
+ # AND
+ # 2. They occur in the same titled section
+ # AND
+ # 3. They point to the same place
+ #
+ # Notes: Scope is used to suppress some entries, but can't be used
+ # for comparing duplicates.
+ # Interpretation of "the same place" depends on whether or
+ # not $linkpoints is true.
+
+ $same = (($aP eq $bP)
+ && ($aS eq $bS)
+ && ($aT eq $bT)
+ && ($a->{'title'} eq $b->{'title'})
+ && ($a->{'href'} eq $b->{'href'}));
+
+ # If we're linking to points, they're only the same if they link
+ # to exactly the same spot. (surely this is redundant?)
+ $same = $same && ($a->{'hrefpoint'} eq $b->{'hrefpoint'})
+ if $linkpoints;
+
+ $same;
+}
+
+sub tsame {
+ # Unlike same(), tsame only compares a single term
+ my($a) = shift;
+ my($b) = shift;
+ my($term) = shift;
+ my($sterm) = substr($term, 0, 1) . "sortas";
+ my($A, $B);
+
+ $A = $a->{$sterm} || $a->{$term};
+ $B = $b->{$sterm} || $b->{$term};
+
+ $A =~ s/^\s*//; $A =~ s/\s*$//; $A = uc($A);
+ $B =~ s/^\s*//; $B =~ s/\s*$//; $B = uc($B);
+
+ return $A eq $B;
+}
+
+sub end_entry {
+ # End any open elements...
+ print OUT "\n </$lastout>\n" if $lastout;
+ print OUT "</indexentry>\n\n";
+ $lastout = "";
+}
+
+sub print_term {
+ # Print out the links for an indexterm. There can be more than
+ # one if the term has a ZONE that points to more than one place.
+ # (do we do the right thing in that case?)
+ my($idx) = shift;
+ my($key, $indent, @hrefs);
+ my(%href) = ();
+ my(%phref) = ();
+
+ $indent = " ";
+
+ if ($idx->{'see'}) {
+ # it'd be nice to make this a link...
+ if ($lastout) {
+ print OUT "\n </$lastout>\n";
+ $lastout = "";
+ }
+ print OUT $indent, "<seeie>", $idx->{'see'}, "</seeie>\n";
+ return;
+ }
+
+ if ($idx->{'seealso'}) {
+ # it'd be nice to make this a link...
+ if ($lastout) {
+ print OUT "\n </$lastout>\n";
+ $lastout = "";
+ }
+ print OUT $indent, "<seealsoie>", $idx->{'seealso'}, "</seealsoie>\n";
+ return;
+ }
+
+ if (keys %{$idx->{'zone'}}) {
+ foreach $key (keys %{$idx->{'zone'}}) {
+ $href{$key} = $idx->{'zone'}->{$key};
+ $phref{$key} = $idx->{'zone'}->{$key};
+ }
+ } else {
+ $href{$idx->{'href'}} = $idx->{'title'};
+ $phref{$idx->{'href'}} = $idx->{'hrefpoint'};
+ }
+
+ # We can't use <LINK> because we don't know the ID of the term in the
+ # original source (and, in fact, it might not have one).
+ print OUT ",\n";
+ @hrefs = keys %href;
+ while (@hrefs) {
+ my($linkend) = "";
+ my($role) = "";
+ $key = shift @hrefs;
+ if ($linkpoints) {
+ $linkend = $phref{$key};
+ } else {
+ $linkend = $key;
+ }
+
+ $role = $linkend;
+ $role = $1 if $role =~ /\#(.*)$/;
+
+ print OUT $indent;
+ print OUT "<ulink url=\"$linkend\" role=\"$role\">";
+ print OUT "<emphasis>" if ($idx->{'significance'} eq 'PREFERRED');
+ print OUT $href{$key};
+ print OUT "</emphasis>" if ($idx->{'significance'} eq 'PREFERRED');
+ print OUT "</ulink>";
+ }
+}
+
+sub termsort {
+ my($aP) = $a->{'psortas'} || $a->{'primary'};
+ my($aS) = $a->{'ssortas'} || $a->{'secondary'};
+ my($aT) = $a->{'tsortas'} || $a->{'tertiary'};
+ my($ap) = $a->{'count'};
+
+ my($bP) = $b->{'psortas'} || $b->{'primary'};
+ my($bS) = $b->{'ssortas'} || $b->{'secondary'};
+ my($bT) = $b->{'tsortas'} || $b->{'tertiary'};
+ my($bp) = $b->{'count'};
+
+ $aP =~ s/^\s*//; $aP =~ s/\s*$//; $aP = uc($aP);
+ $aS =~ s/^\s*//; $aS =~ s/\s*$//; $aS = uc($aS);
+ $aT =~ s/^\s*//; $aT =~ s/\s*$//; $aT = uc($aT);
+ $bP =~ s/^\s*//; $bP =~ s/\s*$//; $bP = uc($bP);
+ $bS =~ s/^\s*//; $bS =~ s/\s*$//; $bS = uc($bS);
+ $bT =~ s/^\s*//; $bT =~ s/\s*$//; $bT = uc($bT);
+
+ if ($aP eq $bP) {
+ if ($aS eq $bS) {
+ if ($aT eq $bT) {
+ # make sure seealso's always sort to the bottom
+ return 1 if ($a->{'seealso'});
+ return -1 if ($b->{'seealso'});
+ # if everything else is the same, keep these elements
+ # in document order (so the index links are in the right
+ # order)
+ return $ap <=> $bp;
+ } else {
+ return $aT cmp $bT;
+ }
+ } else {
+ return $aS cmp $bS;
+ }
+ } else {
+ return $aP cmp $bP;
+ }
+}
+
+sub sortsymbols {
+ my(@term) = @_;
+ my(@new) = ();
+ my(@sym) = ();
+ my($letter);
+ my($idx);
+
+ # Move the non-letter things to the front. Should digits be thier
+ # own group? Maybe...
+ foreach $idx (@term) {
+ $letter = $idx->{'psortas'};
+ $letter = $idx->{'primary'} if !$letter;
+ $letter = uc(substr($letter, 0, 1));
+
+ if (($letter lt 'A') || ($letter gt 'Z')) {
+ push (@sym, $idx);
+ } else {
+ push (@new, $idx);
+ }
+ }
+
+ return (@sym, @new);
+}
+
+sub safe_open {
+ local(*OUT) = shift;
+ local(*F, $_);
+
+ if (($outfile ne '-') && (!$forceoutput)) {
+ my($handedit) = 1;
+ if (open (OUT, $outfile)) {
+ while (<OUT>) {
+ if (/<!-- Remove this comment if you edit this file by hand! -->/){
+ $handedit = 0;
+ last;
+ }
+ }
+ close (OUT);
+ } else {
+ $handedit = 0;
+ }
+
+ if ($handedit) {
+ print "\n$outfile appears to have been edited by hand; use -f or\n";
+ print " change the output file.\n";
+ exit 1;
+ }
+ }
+
+ open (OUT, ">$outfile") || die "$usage\nCannot write to $outfile.\n";
+
+ if ($preamble) {
+ # Copy the preamble
+ if (open(F, $preamble)) {
+ while (<F>) {
+ print OUT $_;
+ }
+ close(F);
+ } else {
+ warn "$0: cannot open preamble $preamble.\n";
+ }
+ }
+}