#! /usr/bin/perl

# This script upgrades PhotoML files to the most recent DTD versions

# Copyright © 2003-2010 Brendt Wohlberg <photoml@wohlberg.net>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License at
# http://www.gnu.org/licenses/gpl-2.0.txt.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.

# Most recent modification: 22 October 2010

use strict;
use File::Basename;
use File::Copy;
use Getopt::Std;

# Set up path variables
my $pmlpath = dirname($0) . "/..";
my $xsl = "$pmlpath/xsl/upgrade";
my $tmp = '/tmp';
if (-r '/etc/xml/catalog' and $ENV{'XML_CATALOG_FILES'} eq '') {
  $ENV{'XML_CATALOG_FILES'} = "/etc/xml/catalog";
}
$ENV{'XML_CATALOG_FILES'} = "$pmlpath/dtd/catalog.xml " .
                            $ENV{'XML_CATALOG_FILES'};
undef $ENV{'SGML_CATALOG_FILES'};


# Ensure xsltproc and xmllint are available
die "pmlupgrade: error executing xstlproc\n"
    if (`which xsltproc 2>/dev/null` eq '');
die "pmlupgrade: error executing xmllint\n"
    if (`which xmllint 2>/dev/null` eq '');

my $ustr = <<EOF;
usage: pmlupgrade [-h] infile [infile] ...
       -h Display usage information
       -n Preserve numeric character references
EOF
my $opts = {};
getopts('hn', $opts);
if ($opts->{'h'}) { print STDERR "$ustr"; exit(0); }
die "$ustr" if (@ARGV == 0);

# Declare variables used within main loop
my $nerr = 0;
my ($x, $tmp0, $tmp1, $tflag, $trans, $cmd, $dtd, $root, $path, $sra);
my ($dtdcl, $dtpub, $dttyp, $dtver, $dtint);
# Work through each file specified on the command line
foreach $x ( @ARGV ) {
  # Ensure current file is readable
  if (! -r $x) {
    warn "pmlupgrade: could not read file $x\n";
    $nerr++;
    next;
  }
  # Determine document type information
  $dtdcl = getdoctyp($x);
  $dtpub = getdtpub($dtdcl);
  ($dttyp,$dtver) = getpubtypver($dtpub);
  $dtint = getdtint($dtdcl);
  # Determine the relevant DTD and skip to next file if not recognised
  if ($dttyp eq "PhotoML") {
    $dtd = 'photo';
    $root = 'photo';
  } elsif ($dttyp eq "DigImageML") {
    $dtd = 'digim';
    $root = 'digimage';
  } else {
    warn "pmlupgrade: unrecognised document type for file $x\n";
    $nerr++;
    $dtd = '';
    next;
  }

  warn "Upgrading $x\n";
  $path = dirname($x);
  # Flag indicating whether current file has been transformed
  $tflag = 0;
  # Find the relevant XSL upgrade transform
  $trans = `find $xsl -name "$dtd-$dtver-*.xsl"`;
  chomp($trans);
  # Temporary file names
  $tmp0 = "$tmp/upgrade.$$.0.xml";
  $tmp1 = "$tmp/upgrade.$$.1.xml";

  # If DTD is one of the versions with ISOlat1 and ISOlat2 character
  # entity references, convert them to the corresponding numeric
  # character references
  if ($dtver <= "0.11") {
    substncr($x, $tmp0);
  } else {
    copy($x,$tmp0);
  }

  $sra = ($opts->{'n'})?"'s/\&/\&amp;/g'":"'s/\\&\\([^#]\\)/\\&amp;\\1/g'";

  # Increment version numbers while transforms available
  while ($trans ne '') {
    # Preprocessing to prevent namespace errors (since the DTD, in
    # which the namespace prefixes are defined, is not loaded), and to
    # prevent parsing and replacement of entity definitions.
    $cmd = "sed -e 's|<$root>|<$root xmlns:xlink=\"http://www.w3.org".
           "/1999/xlink\">|g' -e $sra $tmp0 > $tmp1";
    system($cmd);
    move($tmp1,$tmp0);

    # Remove invalid namespace prefix from merge reject PI when upgrading
    # from DTD version 0.12
    if ($dtver == "0.12") {
      $cmd = "sed -e 's|<\?merge:reject|<\?merge-reject|g' $tmp0 > $tmp1";
      system($cmd);
      move($tmp1,$tmp0);
    }

    # Apply the XSL transform
    $cmd = "xsltproc --path $path --novalid -o $tmp1 $trans $tmp0";
    system($cmd);
    move($tmp1,$tmp0);
    # In applying the last upgrade for digim files, the root element
    # is changed to photo
    $root = 'photo' if ($dtd eq 'digim' && $dtver eq '0.02');
    # Undo preprocessing changes
    $cmd = "sed -e 's|<$root xmlns:xlink=\"http://www.w3.org/1999/".
             "xlink\">|<$root>|g' -e 's/\&amp;/\\&/g' $tmp0 > $tmp1";
    system($cmd);
    move($tmp1,$tmp0);

    # Find the next relevant XSL transform
    if ($trans =~ /-([\d\.]+)\.xsl$/) {
      $dtver = $1;
    } else {
      die "pmlupgrade: error in accessing upgrade XSL transforms\n";
    }
    $trans = `find $xsl -name "$dtd-$dtver-*.xsl"`;
    chomp($trans);
    $tflag = 1;
  }

  # If an upgrade transform has been applied, insert the internal
  # subset removed by XSL processing, and validate the result
  if ($tflag == 1) {
    setdtint($dtint, $tmp0, "$x.tmp");
    $cmd = "xmllint --valid --noout $x.tmp 2>/dev/null";
    if (!system($cmd)) {
      move($x, "$x.bak");
      move("$x.tmp",$x);
    } else {
      move("$x.tmp","$x.fail");
      warn "pmlupgrade: validity error in upgrading file $x\n";
      $nerr++;
    }
  }

  # Remove temporary files
  unlink($tmp0);
  unlink($tmp1);
}

exit ($nerr > 0)?1:0;


# Parse strings delimited by (possibly nested) angle brackets
sub abmatch {
  my $fh = shift;
  my $lt = shift;

  my $mt = '';
  my $line;
  my $n = 1;
  while ($n > 0) {
    while (($lt !~ /<|>/) and ($line = <$fh>)) {
      $lt .= $line;
    }
    if ($lt =~ /<|>/) {
      $mt .= $` . $&;
      $lt = $';
      $n = ($& eq ">")?($n-1):($n+1);
    } else {
      return undef;
    }
  }
  return $mt;
}


# Extract the document type declaration from an XML file
sub getdoctyp {
  my $fn = shift;

  my $line;
  open(FH, "< $fn");
  my $fh = *FH;
  while ($line = <$fh>) {
    last if ($line =~ /<!DOCTYPE/);
  }
  if ($line =~ /<!DOCTYPE/) {
    return $& . abmatch($fh, $');
  } else {
    return undef;
  }
}


# Extract the public identifier from a document type declaration
sub getdtpub {
  my $dt = shift;

  if ($dt =~ /DOCTYPE\s+\w+\s+PUBLIC\s+\"([^\"]+)\"/) {
    return $1;
  } else {
    return undef;
  }
}


# Extract the type and version number from a PhotoML/DigImageML file
sub getpubtypver {
  my $dtp = shift;

  if ($dtp =~ /DTD\s+(\w+)\s+([\d|\.]+)\/\/\w{2}$/) {
    return ($1,$2);
  } else {
    return undef;
  }
}


# Extract the internal subset from a document type declaration
sub getdtint {
  my $dt = shift;

  if ($dt =~ /\[([^\]]+)\]/) {
    return $1;
  } else {
    return undef;
  }
}


# Insert an internal subset into the document type declaration in a
# XML file
sub setdtint {
  my $iss = shift;
  my $fn0 = shift;
  my $fn1 = shift;

  if ($iss eq '') {
    copy($fn0, $fn1);
    return 1;
  }
  local(*FH, $/);
  open(FH, "<$fn0") or return undef;
  my $ft = <FH>;
  close(FH);
  $ft =~ s/(<!DOCTYPE\s+[^>]+)>/$1 \[$iss\]>/;
  open(FH, ">$fn1") or return undef;
  print FH $ft;
  close(FH);
  return 1;
}


# Replace ISOlat1 and ISOlat2 character entity references with the
# corresponding numeric character references
sub substncr {
  my $fn0 = shift;
  my $fn1 = shift;

  open(IFH, "<$fn0") or return undef;
  open(OFH, ">$fn1") or return undef;
  my $line;
  while ($line = <IFH>) {
    $line =~ s/\&([^\;]+)\;/@{[getncr($1)]}/g;
    print OFH $line;
  }
  close(OFH);
  close(IFH);
  return 1;
}


# Convert ISOlat1 and ISOlat2 character entity references to numeric
# character references, returning other entity references unchanged
sub getncr {
  my $cer = shift;

  my $ncr = {"Agrave" => "&#192;", "Aacute" => "&#193;", "Acirc"  => "&#194;",
        "Atilde" => "&#195;" , "Auml"   => "&#196;" , "Aring"  => "&#197;" ,
        "AElig"  => "&#198;" , "Ccedil" => "&#199;" , "Egrave" => "&#200;" ,
        "Eacute" => "&#201;" , "Ecirc"  => "&#202;" , "Euml"   => "&#203;" ,
        "Igrave" => "&#204;" , "Iacute" => "&#205;" , "Icirc"  => "&#206;" ,
        "Iuml"   => "&#207;" , "ETH"    => "&#208;" , "Ntilde" => "&#209;" ,
        "Ograve" => "&#210;" , "Oacute" => "&#211;" , "Ocirc"  => "&#212;" ,
        "Otilde" => "&#213;" , "Ouml"   => "&#214;" , "Oslash" => "&#216;" ,
        "Ugrave" => "&#217;" , "Uacute" => "&#218;" , "Ucirc"  => "&#219;" ,
        "Uuml"   => "&#220;" , "Yacute" => "&#221;" , "THORN"  => "&#222;" ,
        "szlig"  => "&#223;" , "agrave" => "&#224;" , "aacute" => "&#225;" ,
        "acirc"  => "&#226;" , "atilde" => "&#227;" , "auml"   => "&#228;" ,
        "aring"  => "&#229;" , "aelig"  => "&#230;" , "ccedil" => "&#231;" ,
        "egrave" => "&#232;" , "eacute" => "&#233;" , "ecirc"  => "&#234;" ,
        "euml"   => "&#235;" , "igrave" => "&#236;" , "iacute" => "&#237;" ,
        "icirc"  => "&#238;" , "iuml"   => "&#239;" , "eth"    => "&#240;" ,
        "ntilde" => "&#241;" , "ograve" => "&#242;" , "oacute" => "&#243;" ,
        "ocirc"  => "&#244;" , "otilde" => "&#245;" , "ouml"   => "&#246;" ,
        "oslash" => "&#248;" , "ugrave" => "&#249;" , "uacute" => "&#250;" ,
        "ucirc"  => "&#251;" , "uuml"   => "&#252;" , "yacute" => "&#253;" ,
        "thorn"  => "&#254;" , "yuml"   => "&#255;" , "Amacr"  => "&#x100;",
        "amacr"  => "&#x101;", "Abreve" => "&#x102;", "abreve" => "&#x103;",
        "Aogon"  => "&#x104;", "aogon"  => "&#x105;", "Cacute" => "&#x106;",
        "cacute" => "&#x107;", "Ccirc"  => "&#x108;", "ccirc"  => "&#x109;",
        "Cdot"   => "&#x10A;", "cdot"   => "&#x10B;", "Ccaron" => "&#x10C;",
        "ccaron" => "&#x10D;", "Dcaron" => "&#x10E;", "dcaron" => "&#x10F;",
        "Dstrok" => "&#x110;", "dstrok" => "&#x111;", "Emacr"  => "&#x112;",
        "emacr"  => "&#x113;", "Edot"   => "&#x116;", "edot"   => "&#x117;",
        "Eogon"  => "&#x118;", "eogon"  => "&#x119;", "Ecaron" => "&#x11A;",
        "ecaron" => "&#x11B;", "Gcirc"  => "&#x11C;", "gcirc"  => "&#x11D;",
        "Gbreve" => "&#x11E;", "gbreve" => "&#x11F;", "Gdot"   => "&#x120;",
        "gdot"   => "&#x121;", "Gcedil" => "&#x122;", "Hcirc"  => "&#x124;",
        "hcirc"  => "&#x125;", "Hstrok" => "&#x126;", "hstrok" => "&#x127;",
        "Itilde" => "&#x128;", "itilde" => "&#x129;", "Imacr"  => "&#x12A;",
        "imacr"  => "&#x12B;", "Iogon"  => "&#x12E;", "iogon"  => "&#x12F;",
        "Idot"   => "&#x130;", "inodot" => "&#x131;", "IJlig"  => "&#x132;",
        "ijlig"  => "&#x133;", "Jcirc"  => "&#x134;", "jcirc"  => "&#x135;",
        "Kcedil" => "&#x136;", "kcedil" => "&#x137;", "kgreen" => "&#x138;",
        "Lacute" => "&#x139;", "Lmidot" => "&#x139;", "lacute" => "&#x13A;",
        "Lcedil" => "&#x13B;", "lcedil" => "&#x13C;", "Lcaron" => "&#x13D;",
        "lcaron" => "&#x13E;", "lmidot" => "&#x140;", "Lstrok" => "&#x141;",
        "lstrok" => "&#x142;", "Nacute" => "&#x143;", "nacute" => "&#x144;",
        "Ncedil" => "&#x145;", "ncedil" => "&#x146;", "Ncaron" => "&#x147;",
        "ncaron" => "&#x148;", "napos"  => "&#x149;", "ENG"    => "&#x14A;",
        "eng"    => "&#x14B;", "Omacr"  => "&#x14C;", "omacr"  => "&#x14D;",
        "Odblac" => "&#x150;", "odblac" => "&#x151;", "OElig"  => "&#x152;",
        "oelig"  => "&#x153;", "Racute" => "&#x154;", "racute" => "&#x155;",
        "Rcedil" => "&#x156;", "rcedil" => "&#x157;", "Rcaron" => "&#x158;",
        "rcaron" => "&#x159;", "Sacute" => "&#x15A;", "sacute" => "&#x15B;",
        "scirc"  => "&#x15C;", "Scirc"  => "&#x15D;", "Scedil" => "&#x15E;",
        "scedil" => "&#x15F;", "Scaron" => "&#x160;", "scaron" => "&#x161;",
        "tcedil" => "&#x162;", "Tcedil" => "&#x163;", "Tcaron" => "&#x164;",
        "tcaron" => "&#x165;", "Tstrok" => "&#x166;", "tstrok" => "&#x167;",
        "Utilde" => "&#x168;", "utilde" => "&#x169;", "Umacr"  => "&#x16A;",
        "umacr"  => "&#x16B;", "Ubreve" => "&#x16C;", "ubreve" => "&#x16D;",
        "Uring"  => "&#x16E;", "uring"  => "&#x16F;", "Udblac" => "&#x170;",
        "udblac" => "&#x171;", "Uogon"  => "&#x172;", "uogon"  => "&#x173;",
        "Wcirc"  => "&#x174;", "wcirc"  => "&#x175;", "Ycirc"  => "&#x176;",
        "ycirc"  => "&#x177;", "Yuml"   => "&#x178;", "Zacute" => "&#x179;",
        "zacute" => "&#x17A;", "Zdot"   => "&#x17B;", "zdot"   => "&#x17C;",
        "Zcaron" => "&#x17D;", "zcaron" => "&#x17E;", "gacute" => "&#x1F5;"};

  my $val = $ncr->{$cer};
  return (defined $val)?$val:"&$cer;";
}
