Last active
March 20, 2023 05:05
-
-
Save mavaddat/67d83986e2a2848b602a0d2efd3103bf to your computer and use it in GitHub Desktop.
A Conversion Tool from DTD to XML Schema
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! perl | |
# | |
# Mavaddat Javid (mavaddat.ca) 2023/03/21 | |
# derived from | |
# | |
# Mary Holstege ([email protected]) 2001/05/15 | |
# derived from | |
# | |
# Yuichi Koike ($Id: dtd2xsd.pl,v 1.2 2001/05/15 13:08:21 mholstege Exp $) | |
# derived from | |
# | |
# Dan Connolly | |
# derived from | |
# | |
# Bert Bos <[email protected]> | |
# Created: 17 Mar 1998 | |
# | |
use strict; | |
use Term::ANSIColor; | |
use LWP::Simple; | |
use LWP::UserAgent; | |
# Handling command line argument | |
my $targetNS = "http://www.w3.org/namespace/"; | |
my $prefix = "t"; | |
my $alias = 0; | |
my $file = ""; | |
my %SimpleTypes; | |
my @AttrGroupPatterns; | |
my @ModelGroupPatterns; | |
my @SubstitutionGroupPatterns; | |
my %SubstitutionGroup; | |
my @ComplexTypePatterns; | |
my %Mixed; | |
my %ModelGroup; | |
my $mapping_file; | |
my $pcdata_flag = 0; | |
my $pcdata_simpletype = "string"; | |
my $debug = 0; | |
while ( $#ARGV >= 0 ) { | |
my $para = shift(@ARGV); | |
if ( $para eq "-ns" ) { | |
$targetNS = shift(@ARGV); | |
} | |
elsif ( $para eq "-prefix" ) { | |
$prefix = shift(@ARGV); | |
} | |
elsif ( $para eq "-alias" ) { | |
$alias = 1; | |
} | |
elsif ( $para eq "-pcdata" ) { | |
# Treat #PCDATA by itself as being string (or other simple type | |
# if so designated in the mapping file) | |
$pcdata_flag = 1; | |
} | |
elsif ( $para eq "-mapfile" ) { | |
$mapping_file = shift(@ARGV); | |
} | |
elsif ( $para eq "-simpletype" ) { | |
my ($pat) = shift(@ARGV); | |
my ($b) = shift(@ARGV); | |
$SimpleTypes{$pat} = $b; | |
} | |
elsif ( $para eq "-attrgroup" ) { | |
push( @AttrGroupPatterns, shift(@ARGV) ); | |
} | |
elsif ( $para eq "-modelgroup" ) { | |
push( @ModelGroupPatterns, shift(@ARGV) ); | |
} | |
elsif ( $para eq "-substgroup" ) { | |
push( @SubstitutionGroupPatterns, shift(@ARGV) ); | |
} | |
elsif ( $para eq "-complextype" ) { | |
push( @ComplexTypePatterns, shift(@ARGV) ); | |
} | |
elsif ( $para eq "-debug" ) { | |
$debug = 1; | |
} | |
elsif ($para eq "-h" | |
|| $para eq "-help" | |
|| $para eq "--help" | |
|| $para eq "-?" | |
|| $para eq "--?" | |
|| $para eq "-usage" | |
|| $para eq "--usage" ) | |
{ | |
print STDERR "dtd2xsd.pl - A Conversion Tool from DTD to XML Schema\n"; | |
print STDERR | |
"Usage: $0 [-ns namespace] [-prefix prefix] [-alias] [-pcdata] [-mapfile mapping_file] [-simpletype pattern base] [-attrgroup pattern] [-modelgroup pattern] [-substgroup pattern] [-complextype pattern] [-debug] file\n"; | |
print STDERR | |
" -ns namespace:\t\tspecify namespace URI\n"; | |
print STDERR " -prefix prefix:\t\tspecify the namespace prefix\n"; | |
print STDERR " -alias:\t\t\tenables special aliases (default off)\n"; | |
print STDERR | |
" -pcdata:\t\t\ttreat #PCDATA by itself as being string (or other simple type if so designated in the mapping file)\n"; | |
print STDERR | |
" -mapfile mapping_file:\tuse mapping_file for mapping patterns\n"; | |
print STDERR " -simpletype pattern base:\tmap pattern to base\n"; | |
print STDERR | |
" -attrgroup pattern:\t\ttreat parameter entities whose name match this pattern as attribute groups\n"; | |
print STDERR " -modelgroup pattern:\t\ttreat parameter entities whose name match this pattern as model groups\n"; | |
print STDERR | |
" -substgroup pattern:\t\tmap pattern to substitution group\n"; | |
print STDERR " -complextype pattern:\t\tmap pattern to complex type\n"; | |
print STDERR " -debug:\t\t\tturn on debugging output\n"; | |
print STDERR " file:\t\t\t\tthe DTD file to be converted\n"; | |
exit(0); | |
} | |
else { | |
$file = $para; | |
} | |
} | |
open( INTERMEDIATE, ">intermediate.out" ); | |
# Alias dictionary: defaults | |
my %alias_dic; | |
$alias_dic{"URI"} = "uriReference"; | |
$alias_dic{"LANG"} = "language"; | |
$alias_dic{"NUMBER"} = "nonNegativeInteger"; | |
$alias_dic{"Date"} = "date"; | |
$alias_dic{"Boolean"} = "boolean"; | |
if ($mapping_file) { | |
print STDERR "Open mapping $mapping_file "; | |
if ( !open( MAPPINGS, "<$mapping_file" ) ) { | |
print STDERR "unsuccessful.\n"; | |
} | |
else { | |
print STDERR "successful.\n"; | |
while (<MAPPINGS>) { | |
chop; | |
if (/^alias\s+([^ \t]+)\s*=\s*([^ \t]+)\s*/i) { | |
$alias_dic{$1} = $2; | |
} | |
elsif (/^simpletype\s+([^ \t]+)\s*=\s*([^ \t]+)\s*/i) { | |
$SimpleTypes{$1} = $2; | |
} | |
elsif (/^attrgroup\s+([^ \t]+)\s*/i) { | |
push( @AttrGroupPatterns, $1 ); | |
} | |
elsif (/^modelgroup\s+([^ \t]+)\s*/i) { | |
push( @ModelGroupPatterns, $1 ); | |
} | |
elsif (/^substgroup\s+([^ \t]+)\s*/i) { | |
push( @SubstitutionGroupPatterns, $1 ); | |
} | |
elsif (/^complextype\s+([^ \t]+)\s*/i) { | |
push( @ComplexTypePatterns, $1 ); | |
} | |
elsif (/^pcdata\s+([^ \t]+)\s*/i) { | |
## BUGLET: doesn't pay attention to prefix; just a special alias | |
$pcdata_simpletype = $1; | |
} | |
} | |
} | |
foreach my $key ( keys(%alias_dic) ) { | |
print STDERR "Alias \%$key to $alias_dic{$key}\n"; | |
} | |
} | |
# Variable declaration | |
my $linelen = 72; | |
my $PROG = substr( $0, rindex( $0, "/" ) + 1 ); | |
my $USAGE = "Usage: $PROG file\n"; | |
my $str = "(?:\"([^\"]*)\"|\'([^\']*)\')"; | |
my %pent; # Parameter entities | |
my %attributes; # Attribute lists | |
my @element; # Elements in source order | |
my %model; # Content models | |
# Main | |
$/ = undef; | |
# Open file, remove comment and include external entity | |
my $buf = openFile($file); | |
open( RAW, ">raw.out" ); | |
print RAW $buf; | |
# Alias treatment | |
my $alias_ident = "_alias_"; | |
if ( $alias eq 1 ) { | |
foreach my $key ( keys(%alias_dic) ) { | |
my $aliaskey = sprintf( "%s%s%s", $alias_ident, $key, $alias_ident ); | |
$buf =~ s/\%$key;/$aliaskey/gsie; | |
} | |
} | |
my %imports; | |
# store external parameter entities | |
while ( $buf =~ s/<!ENTITY\s+%\s+(\S+)\s+PUBLIC\s+$str\s+$str.*?>//sie ) { | |
print STDERR "$1 is $4.$5\n"; | |
$imports{$1} = $4 . $5; | |
} | |
while ( $buf =~ s/<!ENTITY\s+%\s+(\S+)\s+SYSTEM\s+$str.*?>//sie ) { | |
print STDERR "$1 is $2.$3\n"; | |
$imports{$1} = $2 . $3; | |
} | |
foreach my $key ( keys(%imports) ) { | |
$pent{$key} = " <!_IMPORT $imports{$key}> "; | |
} | |
# store all parameter entities | |
while ( $buf =~ s/<!ENTITY\s+%\s+(\S+)\s+$str\s*>//sie ) { | |
my ( $n, $repltext ) = ( $1, $2 . $3 ); | |
my ($pat); | |
next if $pent{$n}; # only the first declaration of an entity counts | |
foreach $pat ( keys %SimpleTypes ) { | |
if ( $n =~ /^$pat$/ ) { | |
$buf .= " <!_DATATYPE $n $SimpleTypes{$pat} $repltext> "; | |
$pent{$n} = "#DATATYPEREF $n"; | |
undef $n; | |
last; | |
} | |
} | |
foreach $pat (@AttrGroupPatterns) { | |
if ( $n =~ /^$pat$/ ) { | |
$buf .= " <!_ATTRGROUP $n $repltext> "; | |
$pent{$n} = "#ATTRGROUPREF $n"; | |
undef $n; | |
last; | |
} | |
} | |
foreach $pat (@ModelGroupPatterns) { | |
if ( $n =~ /^$pat$/ ) { | |
$buf .= " <!_MODELGROUP $n $repltext> "; | |
$pent{$n} = "#MODELGROUPREF $n"; | |
undef $n; | |
last; | |
} | |
} | |
foreach $pat (@SubstitutionGroupPatterns) { | |
if ( $n =~ /^$pat$/ ) { | |
$buf .= " <!_SUBSTGROUP $n $repltext> "; | |
$pent{$n} = "#SUBSTGROUPREF $n"; | |
undef $n; | |
last; | |
} | |
} | |
foreach $pat (@ComplexTypePatterns) { | |
if ( $n =~ /^$pat$/ ) { | |
$buf .= " <!_COMPLEXTYPE $n $repltext> "; | |
$pent{$n} = "#COMPLEXTYPEREF $n"; | |
undef $n; | |
last; | |
} | |
} | |
$pent{$n} = $repltext if $n; | |
} | |
open( MUNGED, ">munged.out" ); | |
print MUNGED $buf; | |
# remove all general entities | |
$buf =~ s/<!ENTITY\s+.*?>//gsie; | |
# loop until parameter entities fully expanded | |
my $i; | |
do { | |
# count # of substitutions | |
$i = 0; | |
# expand parameter entities | |
$buf =~ s/%([a-zA-Z0-9_\.-]+);?/$i++,$pent{$1}/gse; | |
} while ( $i != 0 ); | |
# treat conditional sections | |
# BUG: Doesn't handle nested conditional sections right | |
while ( $buf =~ s/<!\[\s*INCLUDE\s*\[([^\]]*\][^\]][^\]]*|[^\]]*)\]\]>/$1/sie ) | |
{ | |
} | |
while ( $buf =~ s/<!\[\s*IGNORE\s*\[([^\]]*\][^\]][^\]]*|[^\]]*)\]\]>/$1/sie ) { | |
} | |
open( EXPANDED, ">expanded.out" ); | |
print EXPANDED $buf; | |
# store attribute lists | |
$buf =~ s/<!ATTLIST\s+(\S+)\s+(.*?)>/store_att($1, $2)/gsie; | |
# store content models | |
$buf =~ s/<!ELEMENT\s+(\S+)\s+(.+?)>/store_elt($1, $2)/gsie; | |
#print "<?xml version='1.0'?>\n"; | |
print "<schema | |
xmlns='http://www.w3.org/2001/XMLSchema' | |
targetNamespace='$targetNS' | |
xmlns:$prefix='$targetNS'>\n"; | |
# find maximum length of non-terminals | |
#my $maxlen = max(map(length, @element)) + 4; | |
# write imports | |
$buf =~ s/<!_IMPORT\s+(\S+)\s*>/write_import($1)/gsie; | |
# write simple type declarations | |
$buf =~ | |
s/<!_DATATYPE\s+(\S+)\s+(\S+)\s+(.+?)>/write_simpleType($1, $2, $3)/gsie; | |
# write complex type declarations | |
$buf =~ s/<!_COMPLEXTYPE\s+(\S+)\s+(.+?)>/write_complexType($1, $2)/gsie; | |
# write attribute groups | |
$buf =~ s/<!_ATTRGROUP\s+(\S+)\s+(.+?)>/write_attrGroup($1, $2)/gsie; | |
# write model groups | |
$buf =~ s/<!_MODELGROUP\s+(\S+)\s+(.+?)>/write_modelGroup($1, $2)/gsie; | |
# write subsitution groups | |
$buf =~ s/<!_SUBSTGROUP\s+(\S+)\s+(.+?)>/write_substitutionGroup($1, $2)/gsie; | |
my $str2 = "(\"[^\"]*\"|\'[^\']*\')"; | |
# write notation declarations | |
$buf =~ | |
s/<!NOTATION\s+(\S+)\s+PUBLIC\s+$str2\s+$str2\s*>/write_notation($1, $2, $3)/gsie; | |
$buf =~ s/<!NOTATION\s+(\S+)\s+PUBLIC\s+$str2\s*>/write_notation($1, $2)/gsie; | |
print INTERMEDIATE $buf; | |
my ($e); | |
# loop over elements, writing XML schema | |
foreach $e (@element) { | |
my $h = $model{$e}; | |
my $h2 = $attributes{$e}; | |
my @model = @$h; | |
my $isSimple = | |
( $pcdata_flag eq 1 ) | |
&& ( $model[1] eq '#PCDATA' ) | |
&& ( ( $#model eq 2 ) | |
|| ( ( $#model eq 3 ) && ( $model[3] eq '*' ) ) ); | |
my $substGroup = $SubstitutionGroup{$e}; | |
if ($substGroup) { | |
$substGroup = " substitutionGroup='$substGroup'"; | |
} | |
# print rule for element $e | |
if ( $isSimple && !$h2 ) { | |
# Assume (#PCDATA) is string | |
print "\n <element name='$e' type='$pcdata_simpletype'$substGroup>\n"; | |
} | |
else { | |
printf "\n <element name='$e'%s>\n", | |
defined($substGroup) ? $substGroup : ""; | |
} | |
if ($isSimple) { | |
# Assume (#PCDATA) is string | |
if ($h2) { | |
print " <complexType>\n"; | |
print " <simpleContent>\n"; | |
print " <extension base='string'>\n"; | |
} | |
} | |
else { | |
# print rule for $e's content model | |
print " <complexType"; | |
if ( $model[0] eq 'EMPTY' ) { | |
if ( !$h2 ) { | |
print "/>\n"; | |
} | |
else { | |
print ">\n"; | |
} | |
} | |
elsif ( $model[0] eq 'ANY' ) { | |
print ">\n"; | |
print " <sequence>\n"; | |
print " <any namespace='$targetNS'/>\n"; | |
print " </sequence>\n"; | |
} | |
else { | |
if ( $debug eq 1 ) { | |
print STDERR "==mixed? @model\n"; #@@ | |
} | |
if ( &isMixed(@model) ) { | |
print " mixed='true'>\n"; | |
} | |
else { | |
print ">\n"; | |
} | |
my @list = &makeChildList( '', @model ); | |
&printChildList( 3, @list ); | |
} | |
} | |
# print rule for $e's attributes | |
if ( !$h2 ) { | |
# nothing | |
} | |
else { | |
&printAttrDecls(@$h2); | |
if ($isSimple) { | |
print " </extension>\n"; | |
print " </simpleContent>\n"; | |
} | |
} | |
if ( !$h2 && $isSimple ) { | |
# Do nothing | |
} | |
elsif ( $h2 || $model[0] ne 'EMPTY' ) { | |
print " </complexType>\n"; | |
} | |
print " </element>\n"; | |
} | |
print "</schema>\n"; | |
exit; | |
sub printSpace { | |
my ($num) = $_[0]; | |
for ( my $i = 0 ; $i < $num ; $i++ ) { | |
print " "; | |
} | |
} | |
sub printChildList { | |
my ( $num, @list ) = @_; | |
my @currentTag = (); | |
for ( my $i = 0 ; $i <= $#list ; $i++ ) { | |
my $n = $list[$i]; | |
if ( $n eq 0 || $n eq 1 || $n eq 2 || $n eq 3 ) { | |
if ( ( $pcdata_flag eq 0 ) | |
&& ( $n eq 0 || $n eq 1 ) | |
&& $list[ $i + 1 ] eq 20 ) | |
{ | |
# The whole list is 0 20 or 1 20; i.e. (#PCDATA) or (#PCDATA)*. | |
# Don't generate a sequence child; mixed handles all this. | |
} | |
else { | |
if ( | |
( | |
!defined( $currentTag[$#currentTag] ) | |
or $currentTag[$#currentTag] eq "" | |
) | |
&& $n eq 0 | |
) | |
{ | |
push( @currentTag, "" ); | |
} | |
printSpace($num); | |
$num++; | |
print "<sequence"; | |
if ( $n eq 1 ) { | |
print " minOccurs='0' maxOccurs='unbounded'"; | |
} | |
elsif ( $n eq 2 ) { | |
print " maxOccurs='unbounded'"; | |
} | |
elsif ( $n eq 3 ) { | |
print " minOccurs='0' maxOccurs='1'"; | |
} | |
print ">\n"; | |
push( @currentTag, "sequence" ); | |
} | |
} | |
elsif ( $n eq 10 || $n eq 11 || $n eq 12 || $n eq 13 ) { | |
printSpace($num); | |
$num++; | |
print "<choice"; | |
if ( $n eq 11 ) { | |
print " minOccurs='0' maxOccurs='unbounded'"; | |
} | |
elsif ( $n eq 12 ) { | |
print " maxOccurs='unbounded'"; | |
} | |
elsif ( $n eq 13 ) { | |
print " minOccurs='0' maxOccurs='1'"; | |
} | |
print ">\n"; | |
push( @currentTag, "choice" ); | |
} | |
elsif ( $n eq 20 ) { | |
my $tag = pop(@currentTag); | |
if ( defined($tag) and $tag ne "" ) { | |
$num--; | |
printSpace($num); | |
print "</", $tag, ">\n"; | |
} | |
} | |
else { | |
printSpace($num); | |
if ( $n eq '#MODELGROUPREF' ) { | |
print "<group ref="; | |
my $eltname = $list[ ++$i ]; | |
if ( $eltname =~ /:/ ) { | |
print "$eltname'"; | |
} | |
else { | |
print "$prefix:$eltname'"; | |
} | |
} | |
elsif ( $n eq '#COMPLEXTYPEREF' ) { | |
my $eltname = $list[ ++$i ]; | |
if ( $eltname =~ /:/ ) { | |
print "<element name='$eltname' type='$eltname'"; | |
} | |
else { | |
print "<element name='$eltname' type='$prefix:$eltname'"; | |
} | |
} | |
elsif ( $n eq '#SUBSTGROUPREF' ) { | |
my $eltname = $list[ ++$i ]; | |
if ( $eltname =~ /:/ ) { | |
print "<element ref='$eltname'"; | |
} | |
else { | |
print "<element ref='$prefix:$eltname'"; | |
} | |
} | |
else { | |
if ( $n =~ /:/ ) { | |
print "<element ref='$n'"; | |
} | |
else { | |
print "<element ref='$prefix:$n'"; | |
} | |
} | |
if ( $currentTag[$#currentTag] ne "choice" ) { | |
if ( $list[ $i + 1 ] eq "+" ) { | |
print " maxOccurs='unbounded'"; | |
$i++; | |
} | |
elsif ( $list[ $i + 1 ] eq "?" ) { | |
print " minOccurs='0' maxOccurs='1'"; | |
$i++; | |
} | |
elsif ( $list[ $i + 1 ] eq "*" ) { | |
print " minOccurs='0' maxOccurs='unbounded'"; | |
$i++; | |
} | |
} | |
print "/>\n"; | |
} | |
} | |
} | |
sub makeChildList { | |
my ( $groupName, @model ) = @_; | |
print INTERMEDIATE "GROUPNAME=", $groupName, "; MODEL=", @model, "\n"; | |
my @ret = (); | |
my @brace = (); | |
for ( my $i = 0 ; $i <= $#model ; $i++ ) { | |
my $n = $model[$i]; | |
if ( $n eq "(" ) { | |
push( @ret, 0 ); | |
push( @brace, $#ret ); | |
} | |
elsif ( $n eq ")" ) { | |
if ( $i < $#model ) { | |
if ( $model[ $i + 1 ] eq "*" ) { | |
$ret[ $brace[$#brace] ] += 1; | |
$i++; | |
} | |
elsif ( $model[ $i + 1 ] eq "+" ) { | |
$ret[ $brace[$#brace] ] += 2; | |
$i++; | |
} | |
elsif ( $model[ $i + 1 ] eq "?" ) { | |
$ret[ $brace[$#brace] ] += 3; | |
$i++; | |
} | |
} | |
pop(@brace); | |
push( @ret, 20 ); | |
} | |
elsif ( $n eq "," ) { | |
$ret[ $brace[$#brace] ] = 0; | |
} | |
elsif ( $n eq "|" ) { | |
$ret[ $brace[$#brace] ] = 10; | |
} | |
elsif ( $n eq "#PCDATA" ) { | |
if ( $model[ $i + 1 ] eq "|" ) { | |
$i++; | |
} | |
if ($groupName) { | |
$Mixed{$groupName} = 1; | |
} | |
} | |
else { | |
push( @ret, $n ); | |
} | |
} | |
# "( ( a | b | c )* )" gets mapped to "0 10 a b c 20 20" which will generate | |
# a spurious sequence element. This is not too harmful when this is an | |
# element content model, but with model groups it is incorrect. | |
# In general we need to strip off 0 20 from the ends when it is redundant. | |
# Redundant means: there is some other group that bounds the whole list. | |
# Note that it gets a little tricky: | |
# ( (a|b),(c|d) ) gets mapped to "0 10 a b 20 10 c d 20 20". If one | |
# naively chops off the 0 and 20 on the groups that there is a 10 on one | |
# end and a 20 on the other, one loses the bounding sequence, which is | |
# required in this case. | |
# | |
if ( | |
$ret[0] eq 0 | |
&& $ret[$#ret] eq 20 | |
&& $ret[ $#ret - 1 ] eq 20 | |
&& ( $ret[1] eq 0 | |
|| $ret[1] eq 1 | |
|| $ret[1] eq 2 | |
|| $ret[1] eq 3 | |
|| $ret[1] eq 10 | |
|| $ret[1] eq 11 | |
|| $ret[1] eq 12 | |
|| $ret[1] eq 13 ) | |
) | |
{ | |
# OK, it is possible that the 0 20 is redundant. Now scan for balance: | |
# All interim 20 between the proposed new start and the proposed new | |
# final one should be at level 1 or above. | |
my $depth = 0; | |
my $redundant_paren = 1; # Assume redundant until proved otherwise | |
for ( my $i = 1 ; $i <= $#ret - 1 ; $i++ ) { | |
if ( $ret[$i] eq 20 ) { | |
$depth--; | |
if ( $i < $#ret - 1 && $depth < 1 ) { | |
$redundant_paren = 0; | |
# print STDERR "i=$i,depth=$depth\n"; | |
} | |
} | |
elsif ($ret[$i] eq 0 | |
|| $ret[$i] eq 1 | |
|| $ret[$i] eq 2 | |
|| $ret[$i] eq 3 | |
|| $ret[$i] eq 10 | |
|| $ret[$i] eq 11 | |
|| $ret[$i] eq 12 | |
|| $ret[$i] eq 13 ) | |
{ | |
$depth++; | |
} | |
} # for | |
if ( $redundant_paren eq 1 ) { | |
print STDERR "Truncating @ret\n"; | |
@ret = @ret[ 1 .. $#ret - 1 ]; | |
} | |
} | |
if ( $debug eq 1 ) { | |
print STDERR "@model to @ret\n"; | |
} | |
return @ret; | |
} | |
sub printAttrDecls { | |
my @atts = @_; | |
for ( my $i = 0 ; $i <= $#atts ; $i++ ) { | |
if ( $atts[$i] eq '#ATTRGROUPREF' ) { | |
print " <attributeGroup ref='$prefix:$atts[$i+1]'/>\n"; | |
$i++; | |
} | |
else { | |
# attribute name | |
print " <attribute name='$atts[$i]'"; | |
# attribute type | |
my @enume; | |
$i++; | |
if ( $atts[$i] eq "(" ) { | |
# like `attname ( yes | no ) #REQUIRED` | |
$i++; | |
while ( $atts[$i] ne ")" ) { | |
if ( $atts[$i] ne "|" ) { | |
push( @enume, $atts[$i] ); | |
} | |
$i++; | |
} | |
} | |
elsif ( $atts[$i] eq '#DATATYPEREF' ) { | |
print " type='$prefix:$atts[++$i]'"; | |
} | |
elsif ( $atts[$i] eq '#COMPLEXTYPEREF' ) { | |
print " type='$prefix:$atts[++$i]'"; | |
} | |
elsif ( $alias eq 1 && $atts[$i] =~ s/$alias_ident//gsie ) { | |
# alias special | |
print " type='$alias_dic{$atts[$i]}'"; | |
} | |
elsif ( $atts[$i] =~ | |
/ID|IDREF|ENTITY|NOTATION|IDREFS|ENTITIES|NMTOKEN|NMTOKENS/ ) | |
{ | |
# common type for DTD and Schema | |
print " type='$atts[$i]'"; | |
} | |
else { | |
# `attname CDATA #REQUIRED` | |
print " type='string'"; | |
} | |
$i++; | |
# #FIXED | |
if ( $atts[$i] eq "#FIXED" ) { | |
$i++; | |
print " use='fixed' value='$atts[$i]'/>\n"; | |
} | |
else { | |
# minOccurs | |
if ( $atts[$i] eq "#REQUIRED" ) { | |
print " use='required'"; | |
} | |
elsif ( $atts[$i] eq "#IMPLIED" ) { | |
print " use='optional'"; | |
} | |
else { | |
print " use='default' value='$atts[$i]'"; | |
} | |
# enumerate | |
if ( $#enume eq -1 ) { | |
print "/>\n"; | |
} | |
else { | |
print ">\n"; | |
print " <simpleType>\n"; | |
print " <restriction base='string'>\n"; | |
&write_enum(@enume); | |
print " </restriction>\n"; | |
print " </simpleType>\n"; | |
print " </attribute>\n"; | |
} | |
} | |
} | |
} | |
} | |
sub write_enum { | |
my (@enume) = @_; | |
for ( my $j = 0 ; $j <= $#enume ; $j++ ) { | |
print " <enumeration value='$enume[$j]'/>\n"; | |
} | |
} | |
# Parse a string into an array of "words". | |
# Words are whitespace-separated sequences of non-whitespace characters, | |
# or quoted strings ("" or ''), with the quotes removed. | |
# HACK: added () stuff for attlist stuff | |
# Parse words for attribute list | |
sub parsewords { | |
my $line = $_[0]; | |
$line =~ s/(\(|\)|\|)/ $1 /g; | |
my $token; | |
my @words = (); | |
while ( $line ne '' ) { | |
if ( $line =~ /^\s+/ ) { | |
# Skip whitespace | |
} | |
elsif ( $line =~ /^\"((?:[^\"]|\\\")*)\"/ ) { | |
$token = $1; | |
$token =~ s/^://gso; | |
$token =~ s/$prefix://gso; | |
push( @words, $token ); | |
} | |
elsif ( $line =~ /^\'((?:[^\']|\\\')*)\'/ ) { | |
$token = $1; | |
$token =~ s/^://gso; | |
$token =~ s/$prefix://gso; | |
push( @words, $token ); | |
} | |
elsif ( $line =~ /^\S+/ ) { | |
$token = $&; | |
$token =~ s/^://gso; | |
$token =~ s/$prefix://gso; | |
push( @words, $token ); | |
} | |
else { | |
die "Cannot happen\n"; | |
} | |
$line = $'; | |
} | |
return @words; | |
} | |
# Store content model, return empty string | |
sub store_elt { | |
my ( $name, $model ) = @_; | |
$model =~ s/\s+/ /gso; | |
$name =~ s/$prefix://gso; | |
$name =~ s/^[^:]+://gso; ###XYZZY latest | |
print INTERMEDIATE "NAME=", $name, "\n"; | |
print INTERMEDIATE "MODEL=", $model, "\n"; | |
push( @element, $name ); | |
my @words; | |
while ( $model =~ s/^\s*(\(|\)|,|\+|\?|\||[\w:_\.-]+|\#\w+|\*)// ) { | |
push( @words, $1 ); | |
print INTERMEDIATE "WORD=", $1, "\n"; | |
} | |
$model{$name} = [@words]; | |
return ''; | |
} | |
# Store attribute list, return empty string | |
sub store_att { | |
my ( $element, $atts ) = @_; | |
my @words = parsewords($atts); | |
$element =~ s/://gso; | |
$element =~ s/$prefix://gso; | |
$attributes{$element} = [@words]; | |
return ''; | |
} | |
sub write_import { | |
my ($file) = @_; | |
$file =~ s/dtd$/xsd/; | |
print "\n <import namespace='$targetNS' schemaLocation='$file'/>\n"; | |
} # write_import | |
sub write_simpleType { | |
my ( $n, $b, $stuff ) = @_; | |
my @words = parsewords($stuff); | |
print "\n <simpleType name='$n'>\n"; | |
print " <restriction base='$b'>\n"; | |
# print STDERR "\n==stuff:\n$stuff \n\n===\n", join('|', @words); | |
my $i = 0; | |
my @enume; | |
if ( $words[$i] eq "(" ) { | |
$i++; | |
while ( $words[$i] ne ")" ) { | |
if ( $words[$i] ne "|" ) { | |
push( @enume, $words[$i] ); | |
} | |
$i++; | |
} | |
write_enum(@enume); | |
} | |
print " </restriction>\n"; | |
print " </simpleType>\n"; | |
} | |
sub write_complexType { | |
my ( $n, $stuff ) = @_; | |
my @words = parsewords($stuff); | |
print "\n <complexType name='$n'>\n"; | |
print "<!-- $stuff -->\n"; | |
my @list = &makeChildList( $n, '(', @words, ')' ); | |
&printChildList( 3, @list ); | |
print " </complexType>\n"; | |
} # write_complexType | |
sub write_attrGroup { | |
my ( $n, $stuff ) = @_; | |
my @words = parsewords($stuff); | |
print "\n <attributeGroup name='$n'>\n"; | |
# print STDERR "\n==stuff:\n$stuff \n\n===\n", join('|', @words); | |
printAttrDecls(@words); | |
print " </attributeGroup>\n"; | |
} | |
sub write_modelGroup { | |
my ( $n, $stuff ) = @_; | |
my @words = parsewords($stuff); | |
print "\n <group name='$n'>\n"; | |
print "<!-- $stuff -->\n"; | |
my @list = &makeChildList( $n, '(', @words, ')' ); | |
&printChildList( 3, @list ); | |
$ModelGroup{$n} = \@list; | |
print " </group>\n"; | |
} | |
sub write_substitutionGroup { | |
my ( $n, $stuff ) = @_; | |
my @words = parsewords($stuff); | |
print "\n <element name='$n' abstract='true'>\n"; | |
my @list = &makeChildList( $n, '(', @words, ')' ); | |
for ( my $i = 0 ; $i < $#list ; $i++ ) { | |
$SubstitutionGroup{ $list[$i] } = $n; | |
} | |
print " </element>\n"; | |
} | |
sub write_notation { | |
my ( $n, $p, $s ) = @_; | |
# No quotes around $p and $s, we already pulled them in | |
print "\n <notation name='$n' public=$p"; | |
if ($s) { | |
print " system=$s"; | |
} | |
print "/>\n"; | |
} | |
sub isMixed { | |
my (@model) = @_; | |
my $isSimple = | |
( $pcdata_flag eq 1 ) | |
&& ( $model[1] eq '#PCDATA' ) | |
&& ( ( $#model eq 2 ) | |
|| ( ( $#model eq 3 ) && ( $model[3] eq '*' ) ) ); | |
if ( $debug eq 1 ) { | |
print STDERR "++ mixed? @model\n"; #@@ | |
} | |
if ($isSimple) { | |
if ( $debug eq 1 ) { | |
print STDERR "++ no; simple type. @model\n"; #@@ | |
} | |
return 0; | |
} | |
my ($i); | |
for ( $i = 0 ; $i <= $#model ; $i++ ) { | |
if ( $model[$i] eq '#PCDATA' | |
|| ( $model[$i] eq '#MODELGROUPREF' && $Mixed{ $model[ $i + 1 ] } ) | |
|| ( $model[$i] eq '#SUBSTGROUPREF' && $Mixed{ $model[ $i + 1 ] } ) | |
) | |
{ | |
if ( $debug eq 1 ) { | |
print STDERR "++ yes! $i @model\n"; #@@ | |
} | |
return 1; | |
} | |
} | |
if ( $debug eq 1 ) { | |
print STDERR "++ no. @model\n"; #@@ | |
} | |
return 0; | |
} | |
# Return maximum value of an array of numbers | |
sub max { | |
my $max = $_[0]; | |
foreach my $i (@_) { | |
if ( $i > $max ) { $max = $i; } | |
} | |
return $max; | |
} | |
# 1) Open file | |
# 2) Remove comment, processing instructions, and general entities | |
# 3) Include external parameter entities recursively | |
# 4) Return the contents of opened file | |
sub openFile { | |
my $file = $_[0]; | |
my %extent; | |
my $bufbuf; | |
if ( $file ne "" ) { | |
print STDERR "open $file "; | |
if ( !open AAA, $file ) { | |
print STDERR " failed!!\n"; | |
return ""; | |
} | |
print STDERR " successful\n"; | |
$bufbuf = <AAA>; | |
} | |
else { | |
print STDERR "open STDIN successful\n"; | |
$bufbuf = <>; | |
} | |
# Strip newlines | |
$bufbuf =~ s/\n//gso; | |
# remove comments | |
$bufbuf =~ s/<!--.*?-->//gso; | |
# remove processing instructions | |
$bufbuf =~ s/<\?.*?>//gso; | |
# store external parameter entities | |
while ( $bufbuf =~ s/<!ENTITY\s+%\s+(\S+)\s+PUBLIC\s+$str\s+$str.*?>//sie ) | |
{ | |
print STDERR "$1 is $4.$5\n"; | |
$extent{$1} = $4 . $5; | |
} | |
while ( $bufbuf =~ s/<!ENTITY\s+%\s+(\S+)\s+SYSTEM\s+$str.*?>//sie ) { | |
print STDERR "$1 is $2.$3\n"; | |
$extent{$1} = $2 . $3; | |
} | |
# read external entity files | |
foreach my $key ( keys(%extent) ) { | |
my $entrep; | |
# check if file is a URL | |
if ( $extent{$key} =~ /^http:\/\// ) { | |
# download the file into memory using LWP | |
my $ua = LWP::UserAgent->new; | |
my $response = $ua->get( $extent{$key} ); | |
if ( $response->is_success ) { | |
$entrep = $response->content; | |
} | |
else { | |
print STDERR "Error: " | |
. $response->status_line | |
. " for $extent{$key} in $file at line $.\n"; | |
$extent{$key} = ""; | |
} | |
} | |
else { | |
$entrep = openFile( $extent{$key} ); | |
} | |
$bufbuf =~ s/%$key;/${entrep}/gsie; | |
} | |
return $bufbuf; | |
} | |
# Changes: 2023/03/14 mj | |
# Add usage message | |
# Recursively load remote DTDs using LWP | |
# Changes: 2002/06/25 mh | |
# Apply fix from Andreus Leue for recursive ext. entity files | |
# Changes: 2001/05/15 mh | |
# Changed to namespace of rec. | |
# Changes: 2001/01/10 mh | |
# Switch to CR syntax | |
# Support external mapping file for type aliases, simple types, model and | |
# attribute groups | |
# Map ANY correctly to wildcard rather than element 'ANY' | |
# Support treating lead PCDATA as string or other aliased simple type instead | |
# of as mixed content (may be more appropriate for data-oriented DTDs) | |
# e.g. <!ELEMENT title (#PCDATA)> => <element name="title" type="string"/> | |
# Support subsitution groups. | |
# | |
# 2001/01/12 mh | |
# Support NOTATION declarations | |
# Fix handling of nested conditional sections | |
# Attempt to compensate for DTDs that are already constructed for namespaces | |
# BUG: Easy to get this wrong | |
## TODO: Change import of external DTDs into schema import | |
## remove xmlns 'attributes' |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment