#!/usr/bin/perl
#
# add_ids.pl
#
# Eugene Eric Kim <eekim@eekim.com>
# http://www.eekim.com/software/purple/
#
# $Id: add_ids.pl,v 1.10 2001/08/10 00:45:40 eekim Exp $
#
# Copyright (c) Eugene Eric Kim 2000-2001.  All rights reserved.
# See COPYING for licensing terms.

=head1 NAME

add_ids.pl - Add statement IDs and hierarchical addresses to an
XML file.

=head1 SYNOPSIS

Usage:
  add_ids.pl [-s] [-r rules.purple] file.xml

=head1 DESCRIPTION

Parses the XML file, adds NIDs to nodes that do not already have one,
and computes and adds hierarchical addresses to all nodes using a
configurable set of rules.  Saves the original XML file to file.xml~.

If a rules file is not specified, defaults to rules for purple.dtd.

This script is not very robust.  It does not validate the XML file,
and it does not handle errors well.  So be careful.

=cut

use lib '/home/eekim/perl';
use strict;
use IO::File;
use File::Copy;
use Getopt::Long;
use Purple::HID;
use XML::DOM;

my $rules_file;
my $on_nid_only = 0;
GetOptions('s!'=>\$on_nid_only, 'r=s'=>\$rules_file);

my $xml_file = $ARGV[0];
if (!$xml_file) {
  print <<EOM;
Usage:
  $0 [-s] [-r rules.purple] file.xml

where file.xml is an XML file.  If no optional rules.purple file is
specified, the default rules for the Purple vocabulary (purple.dtd)
are used.  Options are:

  -s    Only add NIDs.

  -r    Use ID generation rules in external file rules.purple.
EOM
  exit;
}

File::Copy::copy($xml_file, "$xml_file~");
&XML::DOM::setTagCompression(\&my_tag_compression);
my $parser = new XML::DOM::Parser;
my $doc = $parser->parsefile("$xml_file~");

### element rules

# default values for purple.dtd
my $tag_contentroot = 'purple';
my $tag_lastnid = 'lastnid';
my %tags_with_nids = ('h'=>1,'p'=>1,'item'=>1,'example'=>1,'figure'=>1);
my %start_tags_recurse = ($tag_contentroot=>1,'section'=>1, 'list'=>1);
my %start_tags_spawn = ('section'=>1, 'list'=>1);
my %start_tags_mark_spawn;
my %start_tags_kill_mark_spawn = ('h'=>1);
my %start_tags_mark_incr = ('p'=>1,'item'=>1,'example'=>1,'figure'=>1);
my %end_tags_kill_incr = ('section'=>1, 'list'=>1);

# override default rules with values in rules file, if it exists
if (-e $rules_file) {
  my $fh = new IO::File $rules_file;
  if (defined $fh) {
    undef $tag_contentroot;
    undef $tag_lastnid;
    undef %tags_with_nids;
    undef %start_tags_recurse;
    undef %start_tags_spawn;
    undef %start_tags_mark_spawn;
    undef %start_tags_kill_mark_spawn;
    undef %start_tags_mark_incr;
    undef %end_tags_kill_incr;
    while (my $line = <$fh>) {
      chomp $line;
      if ($line =~ /^([A-Z_]+)=([A-Za-z,]+)$/) {
        my $var_name = $1;
        my $var_value = $2;
        if ($var_name eq 'CONTENT_ROOT') {
          $tag_contentroot = $var_value;
        }
        elsif ($var_name eq 'LAST_NID') {
          $tag_lastnid = $var_value;
        }
        elsif ($var_name eq 'TAGS_WITH_NIDS') {
          %tags_with_nids = map { $_ => 1 } split(',', $var_value);
        }
        elsif ($var_name eq 'ON_START_TAGS_RECURSE') {
          %start_tags_recurse = map { $_ => 1 } split(',', $var_value);
        }
        elsif ($var_name eq 'ON_START_TAGS_SPAWN') {
          %start_tags_spawn = map { $_ => 1 } split(',', $var_value);
        }
        elsif ($var_name eq 'ON_START_TAGS_MARK_SPAWN') {
          %start_tags_mark_spawn = map { $_ => 1 } split(',', $var_value);
        }
        elsif ($var_name eq 'ON_START_TAGS_KILL_MARK_SPAWN') {
          %start_tags_kill_mark_spawn = map { $_ => 1 } split(',', $var_value);
        }
        elsif ($var_name eq 'ON_START_TAGS_MARK_INCREMENT') {
          %start_tags_mark_incr = map { $_ => 1 } split(',', $var_value);
        }
        elsif ($var_name eq 'ON_END_TAGS_KILL_INCREMENT') {
          %end_tags_kill_incr = map { $_ => 1 } split(',', $var_value);
        }
      }
    }
    $start_tags_recurse{$tag_contentroot} = 1;
  }
  $fh->close;
}

### find largest NID

my $current_nid = 0;
my $last_nid = 0;
my @nodes;
my $node;

foreach my $tag (keys %tags_with_nids) {
  $current_nid = &find_largest_nid($tag, $current_nid);
}
$current_nid++;
if ($tag_lastnid) {
  @nodes = ($doc->getElementsByTagName($tag_lastnid))[0]->getChildNodes;
  $last_nid = $nodes[0]->getData if (@nodes);
  if ($last_nid > $current_nid) {
    $current_nid = $last_nid + 1;
  }
}

### walk tree and add NIDs and HIDs (hierarchical addresss)

my $hid = new Purple::HID unless ($on_nid_only);

&traverse(($doc->getElementsByTagName($tag_contentroot))[0]);

### update lastnid element

if ($tag_lastnid) {
  $node = ($doc->getElementsByTagName($tag_lastnid))[0];
  @nodes = $node->getChildNodes;
  if (@nodes) {
    $nodes[0]->setData($current_nid - 1);
  }
  else {  # add lastnid element
    $node->insertBefore($doc->createTextNode($current_nid - 1), undef);
  }
}

### print to file

$doc->printToFile("$xml_file");

# fini

### subroutines

sub find_largest_nid {
  my $element_name = shift;
  my ($current_nid) = @_;
  my @nodes = $doc->getElementsByTagName($element_name);

  foreach my $node (@nodes) {
    my $nid = $node->getAttribute('nid');
    if ($nid > $current_nid) {
      $current_nid = $nid;
    }
  }
  return $current_nid;
}

sub traverse {
  my $node = shift;
  my $nodename = $node->getNodeName;

  if ($start_tags_recurse{$nodename}) {
    &update_ids_start_tags($node);
    foreach my $child ($node->getChildNodes) {
      &traverse($child);
    }
    &update_ids_endtags($nodename);
  }
  elsif ($tags_with_nids{$nodename}) {
    &update_ids_start_tags($node);
  }
}

sub update_ids_start_tags {
  my $node = shift;
  my $nodename = $node->getNodeName;

  if ($start_tags_spawn{$nodename}) {
    $hid->spawn unless ($on_nid_only);
  }
  elsif ($start_tags_mark_spawn{$nodename}) {
    &add_ids($node);
    $hid->spawn unless ($on_nid_only);
  }
  elsif ($start_tags_kill_mark_spawn{$nodename}) {
    $hid->kill unless ($on_nid_only);
    &add_ids($node);
    $hid->spawn unless ($on_nid_only);
  }
  elsif ($start_tags_mark_incr{$nodename}) {
    &add_ids($node);
    $hid->incr unless ($on_nid_only);
  }
}

sub update_ids_endtags {
  my $nodename = shift;

  if ($end_tags_kill_incr{$nodename}) {
    $hid->kill unless ($on_nid_only);
    $hid->incr unless ($on_nid_only);
  }
}

sub add_ids {
  my $node = shift;

  my $nid = $node->getAttribute('nid');
  if (!$nid) {
    $node->setAttribute('nid', "0$current_nid");
    $current_nid++;
  }
  $node->setAttribute('hid', $hid->string) unless ($on_nid_only);
}

sub my_tag_compression {
  my ($tag, $elem) = @_;

  return 1 if ($tag =~ /^p$/);
  return 0;
}

=head1 RULES-BASED ID GENERATION

add_ids.pl uses a depth-first style algorithm for traversing the DOM
tree and computing and adding the appropriate IDs.  You can define
files containing rules that add_ids.pl will use to determine when and
how to increment the hierarchical ID (HID).

The rules are defined as follows:

    RULE_NAME=tag1,tag2,tag3

where RULE_NAME is the name of the rule, and tag1, tag2, tag3... are
the elements to which the rule applies.

There are nine rules:

=head2 CONTENT_ROOT

The top-level element.  add_ids.pl starts parsing with this element.
Normally, this is the document root element, although in some cases,
you can specify a lower-level element to reduce the amount of parsing.
For example, if you were using add_ids.pl on XHTML, you could specify
<xhtml> as the CONTENT_ROOT, but it is better to specify <body>,
because all of the elements that will obtain IDs are contained within
the <body> elements.

=head2 LAST_NID

If the XML vocabulary reserves an element to store the last NID used
in the lifespan of the document, list that here.  This is an optional
rule.  If it is not defined, add_ids.pl uses the largest available NID
currently in the document.

=head2 TAGS_WITH_NIDS

List every tag that will contain an ID in this rule.

=head2 ON_START_TAGS_RECURSE

When add_ids.pl sees tags from this rule, it will recurse one level
and examine the children elements as wel.

=head2 ON_START_TAGS_SPAWN

On seeing tags defined here, spawn a new level of the HID.  For
instance, if:

    ON_START_TAGS_SPAWN=list

and if the HID is 2C when add_ids.pl comes across a <list> element,
add_ids.pl will change the HID to 2C1.

=head2 ON_START_TAGS_MARK_SPAWN

On seeing tags defined here, sets the NID and HID attributes, and
spawns a new level of the HID.

=head2 ON_START_TAGS_KILL_MARK_SPAWN

Kills the current level of the HID, sets the NID and HID attributes,
and spawns that level of the HID again.

=head2 ON_START_TAGS_MARK_INCREMENT

Sets the NID and HID attributes of the tags defined here, and
increments the current HID.

=head2 ON_END_TAGS_KILL_INCREMENT

On seeing the end tag of the elements listed here, kills the current
level of the HID, and then increments it.  For instance, if:

    ON_END_TAGS_KILL_INCREMENT=list

and if the HID is 2C1 when the program sees </list>, add_ids.pl will
change the HID to 2D.

=head1 SAMPLE RULES FILE

Here is the default rules file used for purple.dtd:

  # rules.purple

  CONTENT_ROOT=purple
  LAST_NID=lastnid
  TAGS_WITH_NIDS=h,p,item,example,figure
  ON_START_TAGS_RECURSE=section,list
  ON_START_TAGS_SPAWN=section,list
  ON_START_TAGS_MARK_SPAWN=
  ON_START_TAGS_KILL_MARK_SPAWN=h
  ON_START_TAGS_MARK_INCREMENT=p,item,example,figure
  ON_END_TAGS_KILL_INCREMENT=section,list

=head1 AUTHOR

Eugene Eric Kim <eekim@eekim.com>

=cut

