From 2f2a298292106adcbd7d28793d99de6581ef5dd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20L=20F=20S=20Bacci?= Date: Fri, 6 Dec 2024 12:05:54 -0300 Subject: [PATCH 1/9] XML Entities --- configure.php | 11 ++++++++++- manual.xml.in | 6 +++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/configure.php b/configure.php index 6338f1b4d..c3a2f3ce8 100755 --- a/configure.php +++ b/configure.php @@ -736,8 +736,17 @@ function getFileModificationHistory(): array { globbetyglob("{$ac['basedir']}/scripts", 'make_scripts_executable'); -$redir = ($ac['quiet'] == 'yes') ? ' > ' . (is_windows() ? 'nul' : '/dev/null') : ''; +{ + $cmd[] = escapeshellarg( $ac['PHP'] ); + $cmd[] = escapeshellarg( __DIR__ . '/scripts/entities.php' ); + $cmd[] = escapeshellarg( $ac['ROOTDIR'] . '/en/entities' ); + if ( $ac['LANG'] != 'en' ) + $cmd[] = escapeshellarg( $ac['ROOTDIR'] . '/' . $ac['LANG'] . '/entities' ); + $cmd = implode( ' ' , $cmd ); + passthru( $cmd ); +} +$redir = ($ac['quiet'] == 'yes') ? ' > ' . (is_windows() ? 'nul' : '/dev/null') : ''; quietechorun("\"{$ac['PHP']}\" -q \"{$ac['basedir']}/scripts/file-entities.php\"{$redir}"); diff --git a/manual.xml.in b/manual.xml.in index 6c807c65b..70751e15e 100644 --- a/manual.xml.in +++ b/manual.xml.in @@ -11,6 +11,11 @@ %language-snippets; @TRANSLATION_ONLY_INCL_END@ + + +%manual-entities; + + @@ -57,7 +62,6 @@ &install.cloud.index; &install.fpm.index; &install.pecl; - &install.composer; &install.ini; From dc8c67ffcca2c4c38efccdb064b4fb3d140b03bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20L=20F=20S=20Bacci?= Date: Fri, 6 Dec 2024 12:06:45 -0300 Subject: [PATCH 2/9] XML Entities --- scripts/dtdent-conv.php | 84 +++++++++ scripts/dtdent-split.php | 96 ++++++++++ scripts/entities.php | 390 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 570 insertions(+) create mode 100644 scripts/dtdent-conv.php create mode 100644 scripts/dtdent-split.php create mode 100644 scripts/entities.php diff --git a/scripts/dtdent-conv.php b/scripts/dtdent-conv.php new file mode 100644 index 000000000..2b1dec206 --- /dev/null +++ b/scripts/dtdent-conv.php @@ -0,0 +1,84 @@ + | ++----------------------------------------------------------------------+ +| Description: Convert old style .ent into new style .ent XML bundle. | ++----------------------------------------------------------------------+ + +See `entities.php` source for detailed rationale. + +Use this for converting bundled entities files that use into +XML version used by `entities.php`. + +After converting, add the generated entities in an global.ent or +manual.ent file, and delete the previous one. + +After all old style .ent files are split or converted, this script can +be removed. */ + +ini_set( 'display_errors' , 1 ); +ini_set( 'display_startup_errors' , 1 ); +error_reporting( E_ALL ); + +if ( count( $argv ) < 2 ) + die(" Syntax: php $argv[0] infile\n" ); + +$infile = $argv[1]; + +$content = file_get_contents( $infile ); + +$pos1 = 0; +while ( true ) +{ + $pos1 = strpos( $content , " DOMNodeList (ampunstand intended) + + $name = trim( $name ); + $text = str_replace( "&" , "&" , $text ); + + $frag = "\n"; + $frag .= " $text\n"; + $frag .= ''; + + $dom = new DOMDocument( '1.0' , 'utf8' ); + $dom->recover = true; + $dom->resolveExternals = false; + libxml_use_internal_errors( true ); + + $dom->loadXML( $frag , LIBXML_NSCLEAN ); + $dom->normalizeDocument(); + + libxml_clear_errors(); + + $text = $dom->saveXML( $dom->getElementsByTagName( "entity" )[0] ); + $text = str_replace( "&" , "&" , $text ); + + echo "$text\n"; +} diff --git a/scripts/dtdent-split.php b/scripts/dtdent-split.php new file mode 100644 index 000000000..168e5aa89 --- /dev/null +++ b/scripts/dtdent-split.php @@ -0,0 +1,96 @@ + | ++----------------------------------------------------------------------+ +| Description: Split old style .ent file into individual files. | ++----------------------------------------------------------------------+ + +See `entities.php` source for detailed rationale. + +Use this for spliting `language-snippets-ent` or other "big" entities +files into individual .xml files. + +After spliting, add the new directory entities/ with they contents, +and remove `language-snippets-ent`, in one go. + +After all old style .ent files are split or converted, this script can +be removed. */ + +ini_set( 'display_errors' , 1 ); +ini_set( 'display_startup_errors' , 1 ); +error_reporting( E_ALL ); + +if ( count( $argv ) < 4 ) + die(" Syntax: php $argv[0] infile outdir [hash user]\n" ); + +$infile = $argv[1]; +$outdir = $argv[2]; +$hash = $argv[3] ?? ""; +$user = $argv[4] ?? "_"; + +$content = file_get_contents( $infile ); +$entities = []; + +// Parse + +$pos1 = 0; +while ( true ) +{ + $pos1 = strpos( $content , " $text ) +{ + $file = "$outdir/$name.xml"; + if ( file_exists( $file ) ) + exit( "Name colision: $file\n" ); +} + +// Write + +foreach( $entities as $name => $text ) +{ + $file = "$outdir/$name.xml"; + + $header = '' . "\n"; + + if ( $hash != "" ) + $header .= "\n"; + + file_put_contents( $file , $header . $text ); +} + +$total = count( $entities ); +print "Generated $total files.\n"; diff --git a/scripts/entities.php b/scripts/entities.php new file mode 100644 index 000000000..3305f6a8d --- /dev/null +++ b/scripts/entities.php @@ -0,0 +1,390 @@ + | ++----------------------------------------------------------------------+ +| Description: Collect individual entities into an .entities.ent file. | ++----------------------------------------------------------------------+ + +# Mental model, or things that I would liked to know 20 years prior + +XML Entity processing has more in common with DOMDocumentFragment than +DOMElement. In other words, simple text and multi rooted XML files +are valid contents, whereas they are not valid XML documents. + +Also, namespaces do not automatically "cross" between a parent +document and their includes, even if they are included in the same +file, as local textual entities. s are, for all intended +purposes, separated documents, with separated namespaces and have +*expected* different default namespaces. + +So each one of, possibly multiple, "root" XML elements inside an +fragment need to be annotated with default namespace, even if the +"root" element occurs surrounded by text. For example: + +- "texttext", need one namespace, or it is invalid, and; +- "", need TWO namespaces, or it is also invalid. + +# Output + +This script collects bundled and individual entity files (detailed +below), at some expected relative paths, and generates an +.entities.ent file, in a sibling position to manual.xml.in. + +The output .entities.ent file has no duplications, so collection +order is important to keep the necessary operational semantics. Here, +newer loaded entities takes priority (overwrites) over previous one. +Note that this is the reverse of convention, where +duplicated entity names are ignored. The priority order used here +is important to allow detecting cases where "constant" entities +are being overwriten, or if translatable entities are missing +translations. + +# Individual tracked entities, or `.xml` files at `entities/` + +As explained above, the individual entity contents are not really +valid XML *documents*, they are only at most valid XML *fragments*. + +Yet, individual entities are stored in entities/ as .xml files, for +two reasons: first, text editors in general can highlights XML syntax, +and second, this allows normal revision tracking on then, without +requiring weird changes on `revcheck.php`. + +# Bundled entities files, group tracked + +For very small textual entities, down to simple text words or single +tag elements, that may never change, individual entity tracking is +an overkill. This script also loads bundled entities files, at +some expected locations, with specific semantics. + +These bundle files are really normal XML files, correctly annotated +with XML namespaces used on manual, so any individual exported entity +have corret XML namespace annotations. These bundle entity files +are revcheck tracked normaly, but are not included in manual.xml.in, +as they only participate in general entity loading, described above. + +- global.ent - expected untranslated +- manual.ent - expected translated +- lang/entities/* - expected translated + +*/ + +ini_set( 'display_errors' , 1 ); +ini_set( 'display_startup_errors' , 1 ); +error_reporting( E_ALL ); + +const PARTIAL_IMPL = true; // For while spliting and bundle convertion are incomplete + +if ( count( $argv ) < 2 || in_array( '--help' , $argv ) || in_array( '-h' , $argv ) ) +{ + fwrite( STDERR , "\nUsage: {$argv[0]} [--debug] entitiesDir [entitiesDir]\n\n" ); + return; +} + +$filename = Entities::rotateOutputFile(); + +$langs = []; +$normal = true; // configure.php mode +$debug = false; // detailed output + +for( $idx = 1 ; $idx < count( $argv ) ; $idx++ ) + if ( $argv[$idx] == "--debug" ) + { + $normal = false; + $debug = true; + } + else + $langs[] = $argv[$idx]; + +if ( $normal ) + print "Creating .entities.ent..."; +else + print "Creating .entities.ent in debug mode.\n"; + +loadEnt( __DIR__ . "/../global.ent" , global: true , warnMissing: true ); +foreach( $langs as $lang ) +{ + loadEnt( __DIR__ . "/../../$lang/global.ent" , global: true ); + loadEnt( __DIR__ . "/../../$lang/manual.ent" , translate: true , warnMissing: true ); + loadEnt( __DIR__ . "/../../$lang/remove.ent" , remove: true ); + loadDir( $langs , $lang ); +} + +Entities::writeOutputFile(); +Entities::checkReplaces( $debug ); + +echo " done: " , Entities::$countTotalGenerated , " entities"; +if ( Entities::$countUnstranslated > 0 ) + echo ", " , Entities::$countUnstranslated , " untranslated"; +if ( Entities::$countConstantReplaced > 0 ) + echo ", " , Entities::$countConstantReplaced , " global replaced"; +if ( Entities::$countRemoveReplaced > 0 ) + echo ", " , Entities::$countRemoveReplaced , " to be removed"; +echo ".\n"; + +exit; + +class EntityData +{ + public function __construct( + public string $path , + public string $name , + public string $text ) {} +} + +class Entities +{ + public static int $countConstantReplaced = 0; + public static int $countUnstranslated = 0; + public static int $countRemoveReplaced = 0; + public static int $countTotalGenerated = 0; + + private static string $filename = __DIR__ . "/../.entities.ent"; // sibling of .manual.xml + + private static array $entities = []; // All entities, overwriten + private static array $global = []; // Entities from global.ent files + private static array $replace = []; // Entities expected replaced / translated + private static array $remove = []; // Entities expected removed + private static array $count = []; // Name / Count + private static array $slow = []; // External entities, slowless, overwrite + + static function put( string $path , string $name , string $text , bool $global = false , bool $replace = false , bool $remove = false ) + { + $entity = new EntityData( $path , $name , $text ); + Entities::$entities[ $name ] = $entity; + + if ( $global ) + Entities::$global[ $name ] = $name; + + if ( $replace ) + Entities::$replace[ $name ] = $name; + + if ( $remove ) + Entities::$remove[ $name ] = $name; + + if ( ! isset( Entities::$count[$name] ) ) + Entities::$count[$name] = 1; + else + Entities::$count[$name]++; + } + + static function slow( string $path ) + { + if ( isset( $slow[$path] ) ) + fwrite( STDERR , "Unexpected physical file ovewrite: $path\n" ); + $slow[ $path ] = $path; + } + + static function rotateOutputFile() + { + if ( file_exists( Entities::$filename ) ) + unlink( Entities::$filename ); + touch( Entities::$filename ); + + Entities::$filename = realpath( Entities::$filename ); // only full paths on XML + } + + static function writeOutputFile() + { + saveEntitiesFile( Entities::$filename , Entities::$entities ); + } + + static function checkReplaces( bool $debug ) + { + Entities::$countTotalGenerated = count( Entities::$entities ); + Entities::$countConstantReplaced = 0; + Entities::$countUnstranslated = 0; + Entities::$countRemoveReplaced = 0; + + foreach( Entities::$entities as $name => $text ) + { + $replaced = Entities::$count[$name] - 1; + $expectedConstant = in_array( $name , Entities::$global ); + $expectedReplaced = in_array( $name , Entities::$replace ); + $expectedRemoved = in_array( $name , Entities::$remove ); + + if ( $expectedConstant && $replaced != 0 ) + { + Entities::$countConstantReplaced++; + if ( $debug ) + print "Expected global, replaced $replaced times:\t$name\n"; + } + + if ( $expectedReplaced && $replaced != 1 ) + { + Entities::$countUnstranslated++; + if ( $debug ) + print "Expected translated, replaced $replaced times:\t$name\n"; + } + + if ( $expectedRemoved && $replaced != 0 ) + { + Entities::$countRemoveReplaced++; + if ( $debug ) + print "Expected removed, replaced $replaced times:\t$name\n"; + } + } + } +} + +function loadEnt( string $path , bool $global = false , bool $translate = false , bool $remove = false , bool $warnMissing = false ) +{ + $absolute = realpath( $path ); + if ( $absolute === false ) + if ( PARTIAL_IMPL ) + return; + else + if ( $warnMissing ) + fwrite( STDERR , "\n Missing entity file: $path\n" ); + $path = $absolute; + + $text = file_get_contents( $path ); + $text = str_replace( "&" , "&" , $text ); + + $dom = new DOMDocument( '1.0' , 'utf8' ); + if ( ! $dom->loadXML( $text ) ) + die( "XML load failed for $path\n" ); + + $xpath = new DOMXPath( $dom ); + $list = $xpath->query( "/*/*" ); + + foreach( $list as $ent ) + { + // weird, namespace correting, DOMNodeList -> DOMDocumentFragment + $other = new DOMDocument( '1.0' , 'utf8' ); + + foreach( $ent->childNodes as $node ) + $other->appendChild( $other->importNode( $node , true ) ); + + $name = $ent->getAttribute( "name" ); + $text = $other->saveXML(); + + $text = str_replace( "&" , "&" , $text ); + $text = rtrim( $text , "\n" ); + $lines = explode( "\n" , $text ); + array_shift( $lines ); // remove XML declaration + $text = implode( "\n" , $lines ); + + Entities::put( $path , $name , $text , $global , $translate , $remove ); + } +} + +function loadDir( array $langs , string $lang ) +{ + global $debug; + + $dir = __DIR__ . "/../../$lang/entities"; + $dir = realpath( $dir ); + if ( $dir === false || ! is_dir( $dir ) ) + if ( PARTIAL_IMPL ) + { + if ( $debug ) + print "Not a directory: $dir\n"; + return; + } + else + exit( "Not directory: $dir\n" ); + + $files = scandir( $dir ); + $expectedReplaced = array_search( $lang , $langs ) > 0; + + foreach( $files as $file ) + { + $path = realpath( "$dir/$file" ); + + if ( is_dir( $path ) ) + continue; + if ( str_starts_with( $file , '.' ) ) + continue; + + $text = file_get_contents( $path ); + $text = rtrim( $text , "\n" ); + + loadXml( $path , $text , $expectedReplaced ); + } +} + +function loadXml( string $path , string $text , bool $expectedReplaced ) +{ + if ( trim( $text ) == "" ) + { + fwrite( STDERR , "\n Empty entity (should it be in remove.ent?): '$path' \n" ); + Entities::put( $pat , $text , remove: true ); + return; + } + + $info = pathinfo( $path ); + $name = $info["filename"]; + + $frag = "$text"; + + $dom = new DOMDocument( '1.0' , 'utf8' ); + $dom->recover = true; + $dom->resolveExternals = false; + libxml_use_internal_errors( true ); + + $res = $dom->loadXML( $frag ); + + $err = libxml_get_errors(); + libxml_clear_errors(); + + foreach( $err as $item ) + { + $msg = trim( $item->message ); + if ( str_starts_with( $msg , "Entity '" ) && str_ends_with( $msg , "' not defined" ) ) + continue; + + fwrite( STDERR , "\n XML load failed on entity file." ); + fwrite( STDERR , "\n Path: $path" ); + fwrite( STDERR , "\n Error: $msg\n" ); + return; + } + + Entities::put( $path , $name , $text , replace: $expectedReplaced ); +} + +function saveEntitiesFile( string $filename , array $entities ) +{ + $tmpDir = __DIR__ . "/entities"; + + $file = fopen( $filename , "w" ); + fputs( $file , "\n\n\n" ); + + foreach( $entities as $name => $entity ) + { + $text = $entity->text; + $quote = ""; + + // If the text contains mixed quoting, keeping it + // as an external file to avoid (re)quotation hell. + + if ( strpos( $text , "'" ) === false ) + $quote = "'"; + if ( strpos( $text , '"' ) === false ) + $quote = '"'; + + if ( $quote == "" ) + { + if ( $entity->path == "" ) + { + $entity->path = $tmpDir . "/{$entity->path}.tmp"; + file_put_contents( $entity->path , $text ); + } + fputs( $file , "path}'>\n\n" ); + Entities::slow( $entity->path ); + } + else + fputs( $file , "\n\n" ); + } + + fclose( $file ); +} From 31fa4b20d69a7afcf6e2b35e1dac15c1f1d8c7d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20L=20F=20S=20Bacci?= Date: Fri, 6 Dec 2024 13:19:06 -0300 Subject: [PATCH 3/9] Fixes on conv/split tools --- scripts/dtdent-conv.php | 8 +++---- scripts/dtdent-split.php | 50 ++++++++++++++++++++++++++++++---------- 2 files changed, 42 insertions(+), 16 deletions(-) diff --git a/scripts/dtdent-conv.php b/scripts/dtdent-conv.php index 2b1dec206..777a2cb2d 100644 --- a/scripts/dtdent-conv.php +++ b/scripts/dtdent-conv.php @@ -12,10 +12,10 @@ +----------------------------------------------------------------------+ | Authors: André L F S Bacci | +----------------------------------------------------------------------+ -| Description: Convert old style .ent into new style .ent XML bundle. | +| Description: Convert DTD Entities files into XML Entities files. | +----------------------------------------------------------------------+ -See `entities.php` source for detailed rationale. +See `entities.php` for detailed rationale. Use this for converting bundled entities files that use into XML version used by `entities.php`. @@ -58,7 +58,7 @@ $name = substr( $content , $pos1 , $pos2 - $pos1 - 1 ); $text = substr( $content , $pos2 , $pos3 - $pos2 ); - // weird &ugly; ass, namespace corret, DOMDocumentFragment -> DOMNodeList (ampunstand intended) + // weird &ugly; ass, namespace correct, DOMDocumentFragment -> DOMNodeList (ampunstand intended) $name = trim( $name ); $text = str_replace( "&" , "&" , $text ); @@ -80,5 +80,5 @@ $text = $dom->saveXML( $dom->getElementsByTagName( "entity" )[0] ); $text = str_replace( "&" , "&" , $text ); - echo "$text\n"; + echo "\n$text\n"; } diff --git a/scripts/dtdent-split.php b/scripts/dtdent-split.php index 168e5aa89..d5d684c44 100644 --- a/scripts/dtdent-split.php +++ b/scripts/dtdent-split.php @@ -12,26 +12,26 @@ +----------------------------------------------------------------------+ | Authors: André L F S Bacci | +----------------------------------------------------------------------+ -| Description: Split old style .ent file into individual files. | +| Description: Split old DTD .ent file into individual XML files. | +----------------------------------------------------------------------+ -See `entities.php` source for detailed rationale. +See `entities.php` for detailed rationale. -Use this for spliting `language-snippets-ent` or other "big" entities -files into individual .xml files. +Use this for spliting `language-snippets-ent` and possible other DTD +entities files into individual .xml files. -After spliting, add the new directory entities/ with they contents, -and remove `language-snippets-ent`, in one go. +After spliting, add generated files under doc-lang/entities/ , and +the original file, in one go. -After all old style .ent files are split or converted, this script can +After all DTD .ent files are split or converted, this script can be removed. */ ini_set( 'display_errors' , 1 ); ini_set( 'display_startup_errors' , 1 ); error_reporting( E_ALL ); -if ( count( $argv ) < 4 ) - die(" Syntax: php $argv[0] infile outdir [hash user]\n" ); +if ( count( $argv ) < 3 ) + die(" Syntax: php $argv[0] infile outdir [hash user]\n" ); $infile = $argv[1]; $outdir = $argv[2]; @@ -75,7 +75,7 @@ { $file = "$outdir/$name.xml"; if ( file_exists( $file ) ) - exit( "Name colision: $file\n" ); + echo( "Entity name colision, OVERWROTE: $file\n" ); } // Write @@ -83,8 +83,7 @@ foreach( $entities as $name => $text ) { $file = "$outdir/$name.xml"; - - $header = '' . "\n"; + $header = ""; if ( $hash != "" ) $header .= "\n"; @@ -92,5 +91,32 @@ file_put_contents( $file , $header . $text ); } +// Test + +$dom = new DOMDocument(); +$dom->recover = true; +$dom->resolveExternals = false; +libxml_use_internal_errors( true ); + +foreach( $entities as $name => $text ) +{ + $file = "$outdir/$name.xml"; + + $text = file_get_contents( $file ); + $text = "$text"; + + $dom->loadXML( $text ); + $err = libxml_get_errors(); + libxml_clear_errors(); + + foreach( $err as $e ) + { + $msg = trim( $e->message ); + if ( str_starts_with( $msg , "Entity '" ) && str_ends_with( $msg , "' not defined" ) ) + continue; + die( "Failed to load $file\n" ); + } +} + $total = count( $entities ); print "Generated $total files.\n"; From 43b9e9e02b947442ea2b2833c59a44e728fdf62a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20L=20F=20S=20Bacci?= Date: Fri, 6 Dec 2024 14:35:12 -0300 Subject: [PATCH 4/9] Example (dist) files, idempotent build and other fixes --- entities/global.ent-dist | 19 +++++++ entities/manual.ent-dist | 14 +++++ entities/remove.ent-dist | 20 +++++++ scripts/entities.php | 116 ++++++++++++++++++++------------------- 4 files changed, 112 insertions(+), 57 deletions(-) create mode 100644 entities/global.ent-dist create mode 100644 entities/manual.ent-dist create mode 100644 entities/remove.ent-dist diff --git a/entities/global.ent-dist b/entities/global.ent-dist new file mode 100644 index 000000000..1d5c90fe8 --- /dev/null +++ b/entities/global.ent-dist @@ -0,0 +1,19 @@ + + + + + + + \ No newline at end of file diff --git a/entities/manual.ent-dist b/entities/manual.ent-dist new file mode 100644 index 000000000..62ca58550 --- /dev/null +++ b/entities/manual.ent-dist @@ -0,0 +1,14 @@ + + + + + + + + \ No newline at end of file diff --git a/entities/remove.ent-dist b/entities/remove.ent-dist new file mode 100644 index 000000000..6bf8988ad --- /dev/null +++ b/entities/remove.ent-dist @@ -0,0 +1,20 @@ + + + + + + + \ No newline at end of file diff --git a/scripts/entities.php b/scripts/entities.php index 3305f6a8d..de090dcae 100644 --- a/scripts/entities.php +++ b/scripts/entities.php @@ -17,12 +17,12 @@ # Mental model, or things that I would liked to know 20 years prior -XML Entity processing has more in common with DOMDocumentFragment than +DTD Entity processing has more in common with DOMDocumentFragment than DOMElement. In other words, simple text and multi rooted XML files are valid contents, whereas they are not valid XML documents. Also, namespaces do not automatically "cross" between a parent -document and their includes, even if they are included in the same +document and their entities, even if they are included in the same file, as local textual entities. s are, for all intended purposes, separated documents, with separated namespaces and have *expected* different default namespaces. @@ -36,11 +36,11 @@ # Output -This script collects bundled and individual entity files (detailed +This script collects grouped and individual entity files (detailed below), at some expected relative paths, and generates an .entities.ent file, in a sibling position to manual.xml.in. -The output .entities.ent file has no duplications, so collection +The output file .entities.ent has no duplications, so collection order is important to keep the necessary operational semantics. Here, newer loaded entities takes priority (overwrites) over previous one. Note that this is the reverse of convention, where @@ -49,52 +49,56 @@ are being overwriten, or if translatable entities are missing translations. -# Individual tracked entities, or `.xml` files at `entities/` +# Individual XML Entities, or `.xml` files at `entities/` As explained above, the individual entity contents are not really valid XML *documents*, they are only at most valid XML *fragments*. Yet, individual entities are stored in entities/ as .xml files, for two reasons: first, text editors in general can highlights XML syntax, -and second, this allows normal revision tracking on then, without -requiring weird changes on `revcheck.php`. +even for XML fragments, and second, this allows normal revision tracking +per file, without requiring weird changes on `revcheck.php`. Note that +is *invalid* to place XML declaration in these fragment files, at least +in files that are invalid XML documents (on multi node rooted ones). -# Bundled entities files, group tracked +# Grouped entities files, file tracked For very small textual entities, down to simple text words or single tag elements, that may never change, individual entity tracking is -an overkill. This script also loads bundled entities files, at +an overkill. This script also loads grouped XML Entities files, at some expected locations, with specific semantics. -These bundle files are really normal XML files, correctly annotated +These grouped files are really normal XML files, correctly annotated with XML namespaces used on manual, so any individual exported entity -have corret XML namespace annotations. These bundle entity files -are revcheck tracked normaly, but are not included in manual.xml.in, -as they only participate in general entity loading, described above. +have correct anc clean XML namespace annotations. These grouped entity +files are tracked normally by revcheck, but are not directly included +in manual.xml.in, as they only participate in general entity loading, +described above. -- global.ent - expected untranslated -- manual.ent - expected translated -- lang/entities/* - expected translated +- global.ent - expected unreplaced +- manual.ent - expected replaced (translated) +- remove.ent - expected unused +- lang/entities/* - expected replaced (translated) */ +const PARTIAL_IMPL = true; // For while XML Entities are not fully implanted in all languages + ini_set( 'display_errors' , 1 ); ini_set( 'display_startup_errors' , 1 ); error_reporting( E_ALL ); -const PARTIAL_IMPL = true; // For while spliting and bundle convertion are incomplete - if ( count( $argv ) < 2 || in_array( '--help' , $argv ) || in_array( '-h' , $argv ) ) { - fwrite( STDERR , "\nUsage: {$argv[0]} [--debug] entitiesDir [entitiesDir]\n\n" ); + fwrite( STDERR , "\nUsage: {$argv[0]} [--debug] langCode [langCode]\n\n" ); return; } -$filename = Entities::rotateOutputFile(); +$filename = Entities::rotateOutputFile(); // idempotent $langs = []; -$normal = true; // configure.php mode -$debug = false; // detailed output +$normal = true; // Normal configure.php mode +$debug = false; // Detailed console mode for( $idx = 1 ; $idx < count( $argv ) ; $idx++ ) if ( $argv[$idx] == "--debug" ) @@ -125,10 +129,10 @@ echo " done: " , Entities::$countTotalGenerated , " entities"; if ( Entities::$countUnstranslated > 0 ) echo ", " , Entities::$countUnstranslated , " untranslated"; -if ( Entities::$countConstantReplaced > 0 ) - echo ", " , Entities::$countConstantReplaced , " global replaced"; -if ( Entities::$countRemoveReplaced > 0 ) - echo ", " , Entities::$countRemoveReplaced , " to be removed"; +if ( Entities::$countReplacedGlobal > 0 ) + echo ", " , Entities::$countReplacedGlobal , " global replaced"; +if ( Entities::$countReplacedRemove > 0 ) + echo ", " , Entities::$countReplacedRemove , " remove replaced"; echo ".\n"; exit; @@ -143,19 +147,19 @@ public function __construct( class Entities { - public static int $countConstantReplaced = 0; public static int $countUnstranslated = 0; - public static int $countRemoveReplaced = 0; + public static int $countReplacedGlobal = 0; + public static int $countReplacedRemove = 0; public static int $countTotalGenerated = 0; - private static string $filename = __DIR__ . "/../.entities.ent"; // sibling of .manual.xml + private static string $filename = __DIR__ . "/../temp/entities.ent"; // idempotent private static array $entities = []; // All entities, overwriten - private static array $global = []; // Entities from global.ent files + private static array $global = []; // Entities expected not replaced private static array $replace = []; // Entities expected replaced / translated - private static array $remove = []; // Entities expected removed + private static array $remove = []; // Entities expected not replaced and not used private static array $count = []; // Name / Count - private static array $slow = []; // External entities, slowless, overwrite + private static array $slow = []; // External entities, slow, uncontroled overwrite static function put( string $path , string $name , string $text , bool $global = false , bool $replace = false , bool $remove = false ) { @@ -189,7 +193,6 @@ static function rotateOutputFile() if ( file_exists( Entities::$filename ) ) unlink( Entities::$filename ); touch( Entities::$filename ); - Entities::$filename = realpath( Entities::$filename ); // only full paths on XML } @@ -201,36 +204,36 @@ static function writeOutputFile() static function checkReplaces( bool $debug ) { Entities::$countTotalGenerated = count( Entities::$entities ); - Entities::$countConstantReplaced = 0; Entities::$countUnstranslated = 0; - Entities::$countRemoveReplaced = 0; + Entities::$countReplacedGlobal = 0; + Entities::$countReplacedRemove = 0; foreach( Entities::$entities as $name => $text ) { $replaced = Entities::$count[$name] - 1; - $expectedConstant = in_array( $name , Entities::$global ); + $expectedGlobal = in_array( $name , Entities::$global ); $expectedReplaced = in_array( $name , Entities::$replace ); $expectedRemoved = in_array( $name , Entities::$remove ); - if ( $expectedConstant && $replaced != 0 ) + if ( $expectedGlobal && $replaced != 0 ) { - Entities::$countConstantReplaced++; + Entities::$countReplacedGlobal++; if ( $debug ) - print "Expected global, replaced $replaced times:\t$name\n"; + print "Expected global, replaced $replaced times: $name\n"; } if ( $expectedReplaced && $replaced != 1 ) { Entities::$countUnstranslated++; if ( $debug ) - print "Expected translated, replaced $replaced times:\t$name\n"; + print "Expected translated, replaced $replaced times: $name\n"; } if ( $expectedRemoved && $replaced != 0 ) { - Entities::$countRemoveReplaced++; + Entities::$countReplacedRemove++; if ( $debug ) - print "Expected removed, replaced $replaced times:\t$name\n"; + print "Expected removed, replaced $replaced times: $name\n"; } } } @@ -238,14 +241,14 @@ static function checkReplaces( bool $debug ) function loadEnt( string $path , bool $global = false , bool $translate = false , bool $remove = false , bool $warnMissing = false ) { - $absolute = realpath( $path ); - if ( $absolute === false ) + $realpath = realpath( $path ); + if ( $realpath === false ) if ( PARTIAL_IMPL ) return; else if ( $warnMissing ) fwrite( STDERR , "\n Missing entity file: $path\n" ); - $path = $absolute; + $path = $realpath; $text = file_get_contents( $path ); $text = str_replace( "&" , "&" , $text ); @@ -259,7 +262,7 @@ function loadEnt( string $path , bool $global = false , bool $translate = false foreach( $list as $ent ) { - // weird, namespace correting, DOMNodeList -> DOMDocumentFragment + // weird, namespace correting, DOMNodeList -> DOMDocumentFragment transform $other = new DOMDocument( '1.0' , 'utf8' ); foreach( $ent->childNodes as $node ) @@ -268,8 +271,8 @@ function loadEnt( string $path , bool $global = false , bool $translate = false $name = $ent->getAttribute( "name" ); $text = $other->saveXML(); - $text = str_replace( "&" , "&" , $text ); $text = rtrim( $text , "\n" ); + $text = str_replace( "&" , "&" , $text ); $lines = explode( "\n" , $text ); array_shift( $lines ); // remove XML declaration $text = implode( "\n" , $lines ); @@ -292,7 +295,7 @@ function loadDir( array $langs , string $lang ) return; } else - exit( "Not directory: $dir\n" ); + exit( "Error: not a directory: $dir\n" ); $files = scandir( $dir ); $expectedReplaced = array_search( $lang , $langs ) > 0; @@ -301,10 +304,10 @@ function loadDir( array $langs , string $lang ) { $path = realpath( "$dir/$file" ); - if ( is_dir( $path ) ) - continue; if ( str_starts_with( $file , '.' ) ) continue; + if ( is_dir( $path ) ) + continue; $text = file_get_contents( $path ); $text = rtrim( $text , "\n" ); @@ -315,18 +318,17 @@ function loadDir( array $langs , string $lang ) function loadXml( string $path , string $text , bool $expectedReplaced ) { + $info = pathinfo( $path ); + $name = $info["filename"]; + $frag = "$text"; + if ( trim( $text ) == "" ) { fwrite( STDERR , "\n Empty entity (should it be in remove.ent?): '$path' \n" ); - Entities::put( $pat , $text , remove: true ); + Entities::put( $path , $name , $text ); return; } - $info = pathinfo( $path ); - $name = $info["filename"]; - - $frag = "$text"; - $dom = new DOMDocument( '1.0' , 'utf8' ); $dom->recover = true; $dom->resolveExternals = false; @@ -354,7 +356,7 @@ function loadXml( string $path , string $text , bool $expectedReplaced ) function saveEntitiesFile( string $filename , array $entities ) { - $tmpDir = __DIR__ . "/entities"; + $tmpDir = __DIR__ . "/temp"; // idempotent $file = fopen( $filename , "w" ); fputs( $file , "\n\n\n" ); From 8a318b5406cdfa08bbeaf3a9bd776431abe65bc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20L=20F=20S=20Bacci?= Date: Fri, 6 Dec 2024 15:24:10 -0300 Subject: [PATCH 5/9] Idempotence and opt-in / partial implantation --- configure.php | 15 ++++++++++----- manual.xml.in | 2 +- scripts/entities.php | 3 ++- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/configure.php b/configure.php index c3a2f3ce8..d2ae74560 100755 --- a/configure.php +++ b/configure.php @@ -737,13 +737,18 @@ function getFileModificationHistory(): array { globbetyglob("{$ac['basedir']}/scripts", 'make_scripts_executable'); { - $cmd[] = escapeshellarg( $ac['PHP'] ); - $cmd[] = escapeshellarg( __DIR__ . '/scripts/entities.php' ); - $cmd[] = escapeshellarg( $ac['ROOTDIR'] . '/en/entities' ); + $cmd[] = $ac['PHP']; + $cmd[] = __DIR__ . '/scripts/entities.php'; if ( $ac['LANG'] != 'en' ) - $cmd[] = escapeshellarg( $ac['ROOTDIR'] . '/' . $ac['LANG'] . '/entities' ); + $cmd[] = 'en'; + $cmd[] = $ac['LANG']; + foreach( $cmd as & $esc ) + $esc = escapeshellarg( $esc ); $cmd = implode( ' ' , $cmd ); - passthru( $cmd ); + $ret = 0; + passthru( $cmd , $ret ); + if ( $ret != 0 ) + errors_are_bad( $ret ); } $redir = ($ac['quiet'] == 'yes') ? ' > ' . (is_windows() ? 'nul' : '/dev/null') : ''; diff --git a/manual.xml.in b/manual.xml.in index 70751e15e..6544a786a 100644 --- a/manual.xml.in +++ b/manual.xml.in @@ -12,7 +12,7 @@ @TRANSLATION_ONLY_INCL_END@ - + %manual-entities; diff --git a/scripts/entities.php b/scripts/entities.php index de090dcae..08544cbd6 100644 --- a/scripts/entities.php +++ b/scripts/entities.php @@ -324,7 +324,8 @@ function loadXml( string $path , string $text , bool $expectedReplaced ) if ( trim( $text ) == "" ) { - fwrite( STDERR , "\n Empty entity (should it be in remove.ent?): '$path' \n" ); + if ( ! PARTIAL_IMPL ) + fwrite( STDERR , "\n Empty entity (should it be in remove.ent?): '$path' \n" ); Entities::put( $path , $name , $text ); return; } From 84e445cb5d61a8102ad42fbaf18acfb22595ac01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20L=20F=20S=20Bacci?= Date: Mon, 9 Dec 2024 09:01:18 -0300 Subject: [PATCH 6/9] Improve comment texts --- scripts/entities.php | 49 ++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/scripts/entities.php b/scripts/entities.php index 08544cbd6..d9402bb66 100644 --- a/scripts/entities.php +++ b/scripts/entities.php @@ -36,41 +36,42 @@ # Output -This script collects grouped and individual entity files (detailed -below), at some expected relative paths, and generates an -.entities.ent file, in a sibling position to manual.xml.in. - -The output file .entities.ent has no duplications, so collection -order is important to keep the necessary operational semantics. Here, -newer loaded entities takes priority (overwrites) over previous one. -Note that this is the reverse of convention, where -duplicated entity names are ignored. The priority order used here -is important to allow detecting cases where "constant" entities -are being overwriten, or if translatable entities are missing -translations. +This script collects grouped and individual XML Entity files +(detailed below), at some expected relative paths, and generates an +doc-base/temp/entities.ent file with their respective DTD Entities. + +The output file has no duplications, so collection order is important +to keep the necessary operational semantics. Here, latter loaded entities +takes priority (overrides) an previous defined one. Note that this is the +reverse of DTD convention, where duplicated entity names are +ignored. The priority order used here is important to allow detecting +cases where global entities are being overwritten, or if expected +translatable entities are missing translations. # Individual XML Entities, or `.xml` files at `entities/` As explained above, the individual entity contents are not really valid XML *documents*, they are only at most valid XML *fragments*. +More technically, these XML files are really well-balanced texts, per +https://www.w3.org/TR/xml-fragment/#defn-well-balanced . Yet, individual entities are stored in entities/ as .xml files, for -two reasons: first, text editors in general can highlights XML syntax, -even for XML fragments, and second, this allows normal revision tracking +two reasons: first, text editors in general can highlights XML syntax in +well-balanced texts; and second, this allows normal revision tracking per file, without requiring weird changes on `revcheck.php`. Note that is *invalid* to place XML declaration in these fragment files, at least -in files that are invalid XML documents (on multi node rooted ones). +in files that are invalid XML documents (on multi-node rooted ones). # Grouped entities files, file tracked For very small textual entities, down to simple text words or single -tag elements, that may never change, individual entity tracking is +tag elements that may never change, individual entity tracking is an overkill. This script also loads grouped XML Entities files, at some expected locations, with specific semantics. These grouped files are really normal XML files, correctly annotated -with XML namespaces used on manual, so any individual exported entity -have correct anc clean XML namespace annotations. These grouped entity +with XML namespaces used on manuals, so any individual exported entity +has correct and clean XML namespace annotations. These grouped entity files are tracked normally by revcheck, but are not directly included in manual.xml.in, as they only participate in general entity loading, described above. @@ -98,17 +99,15 @@ $langs = []; $normal = true; // Normal configure.php mode -$debug = false; // Detailed console mode for( $idx = 1 ; $idx < count( $argv ) ; $idx++ ) if ( $argv[$idx] == "--debug" ) - { $normal = false; - $debug = true; - } else $langs[] = $argv[$idx]; +$debug = ! $normal; + if ( $normal ) print "Creating .entities.ent..."; else @@ -154,12 +153,12 @@ class Entities private static string $filename = __DIR__ . "/../temp/entities.ent"; // idempotent - private static array $entities = []; // All entities, overwriten + private static array $entities = []; // All entities, bi duplications private static array $global = []; // Entities expected not replaced private static array $replace = []; // Entities expected replaced / translated private static array $remove = []; // Entities expected not replaced and not used private static array $count = []; // Name / Count - private static array $slow = []; // External entities, slow, uncontroled overwrite + private static array $slow = []; // External entities, slow, uncontrolled file overwrites static function put( string $path , string $name , string $text , bool $global = false , bool $replace = false , bool $remove = false ) { @@ -184,7 +183,7 @@ static function put( string $path , string $name , string $text , bool $global = static function slow( string $path ) { if ( isset( $slow[$path] ) ) - fwrite( STDERR , "Unexpected physical file ovewrite: $path\n" ); + fwrite( STDERR , "Unexpected file overwrite: $path\n" ); $slow[ $path ] = $path; } From d2fee0a5fcb7301c3553e86c5da91608a59a6ca5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20L=20F=20S=20Bacci?= Date: Mon, 9 Dec 2024 10:52:34 -0300 Subject: [PATCH 7/9] Detect duplicated entity names on first language --- scripts/entities.php | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/scripts/entities.php b/scripts/entities.php index d9402bb66..3e9fcb86b 100644 --- a/scripts/entities.php +++ b/scripts/entities.php @@ -98,20 +98,21 @@ $filename = Entities::rotateOutputFile(); // idempotent $langs = []; -$normal = true; // Normal configure.php mode +$normal = true; +$debug = false; for( $idx = 1 ; $idx < count( $argv ) ; $idx++ ) if ( $argv[$idx] == "--debug" ) $normal = false; else $langs[] = $argv[$idx]; - $debug = ! $normal; if ( $normal ) print "Creating .entities.ent..."; else print "Creating .entities.ent in debug mode.\n"; +$debug = ! $normal; loadEnt( __DIR__ . "/../global.ent" , global: true , warnMissing: true ); foreach( $langs as $lang ) @@ -120,6 +121,7 @@ loadEnt( __DIR__ . "/../../$lang/manual.ent" , translate: true , warnMissing: true ); loadEnt( __DIR__ . "/../../$lang/remove.ent" , remove: true ); loadDir( $langs , $lang ); + Entities::$debugUnique = false; } Entities::writeOutputFile(); @@ -132,6 +134,8 @@ echo ", " , Entities::$countReplacedGlobal , " global replaced"; if ( Entities::$countReplacedRemove > 0 ) echo ", " , Entities::$countReplacedRemove , " remove replaced"; +if ( Entities::$countDuplicated > 0 ) + echo ", " , Entities::$countDuplicated , " duplicated (first language)"; echo ".\n"; exit; @@ -146,20 +150,24 @@ public function __construct( class Entities { - public static int $countUnstranslated = 0; - public static int $countReplacedGlobal = 0; - public static int $countReplacedRemove = 0; - public static int $countTotalGenerated = 0; - private static string $filename = __DIR__ . "/../temp/entities.ent"; // idempotent private static array $entities = []; // All entities, bi duplications private static array $global = []; // Entities expected not replaced private static array $replace = []; // Entities expected replaced / translated private static array $remove = []; // Entities expected not replaced and not used + private static array $unique = []; // For detecting duplicated global+en entities private static array $count = []; // Name / Count private static array $slow = []; // External entities, slow, uncontrolled file overwrites + public static bool $debugUnique = true; // Start on unique mode, disable on second language + + public static int $countUnstranslated = 0; + public static int $countReplacedGlobal = 0; + public static int $countReplacedRemove = 0; + public static int $countTotalGenerated = 0; + public static int $countDuplicated = 0; + static function put( string $path , string $name , string $text , bool $global = false , bool $replace = false , bool $remove = false ) { $entity = new EntityData( $path , $name , $text ); @@ -174,10 +182,22 @@ static function put( string $path , string $name , string $text , bool $global = if ( $remove ) Entities::$remove[ $name ] = $name; - if ( ! isset( Entities::$count[$name] ) ) + if ( ! isset( Entities::$count[ $name ] ) ) Entities::$count[$name] = 1; else Entities::$count[$name]++; + + if ( Entities::$debugUnique ) + { + if ( isset( Entities::$unique[ $name ] ) ) + { + Entities::$countDuplicated++; + if ( Entities::$countDuplicated == 1 ) + fwrite( STDERR , "\n" ); + fwrite( STDERR , "\n Duplicated entity: $name\n" ); + } + Entities::$unique[ $name ] = $entity; + } } static function slow( string $path ) From 6c023e7dafce249004ffe450918c07a22dd37f4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20L=20F=20S=20Bacci?= Date: Mon, 9 Dec 2024 11:04:16 -0300 Subject: [PATCH 8/9] Align output --- scripts/entities.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/entities.php b/scripts/entities.php index 3e9fcb86b..2e927c8cd 100644 --- a/scripts/entities.php +++ b/scripts/entities.php @@ -193,8 +193,8 @@ static function put( string $path , string $name , string $text , bool $global = { Entities::$countDuplicated++; if ( Entities::$countDuplicated == 1 ) - fwrite( STDERR , "\n" ); - fwrite( STDERR , "\n Duplicated entity: $name\n" ); + fwrite( STDERR , "\n\n" ); + fwrite( STDERR , " Duplicated entity: $name\n" ); } Entities::$unique[ $name ] = $entity; } From 57d7a87ab3eca5db07900113a4074b2916752a50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20L=20F=20S=20Bacci?= Date: Mon, 9 Dec 2024 11:16:32 -0300 Subject: [PATCH 9/9] Reserve space for revision on original files --- scripts/dtdent-split.php | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/dtdent-split.php b/scripts/dtdent-split.php index d5d684c44..d23863b31 100644 --- a/scripts/dtdent-split.php +++ b/scripts/dtdent-split.php @@ -83,9 +83,10 @@ foreach( $entities as $name => $text ) { $file = "$outdir/$name.xml"; - $header = ""; - if ( $hash != "" ) + if ( $hash == "" ) + $header = ''; + else $header .= "\n"; file_put_contents( $file , $header . $text );