# ParserNonXS.pm: parse texinfo code into a tree.
#
# Copyright 2010-2024 Free Software Foundation, Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License,
# or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
#
# Original author: Patrice Dumas
# Parts (also from Patrice Dumas) come from texi2html.pl or texi2html.init.

# Since there are different parser implementations, XS and NonXS, it is
# better to have the Texinfo::Parser packages define only the parser
# API functions.  Constants, functions useful in both parsers, and other
# functions useful in other codes are better defined in other Texinfo
# modules.

# The organization of the file is the following:
#  default parser state.  With explanation of the internal structures.
#  determination of command types.
#  user visible subroutines and subroutines related to input.
#  internal subroutines, doing the parsing.

# In general, the Parser works with character strings decoded from the
# command line, from input files or from the parsed document.  There are
# exceptions for the following files and directory names that are binary
# strings:
# * the input file name passed through parse_texi_file is a binary string
# * @include file name and CPP line directive file names are encoded
#   into binary strings.
# Those binary strings are in 'file_name' keys, they transit through
# $self->{'input'} and end up in 'source_info' in tree elements and
# in error messages.
#
# The following parser information is directly determined from the
# input file name as binary strings
# ->{'global_info'}->{'input_file_name'}
# ->{'global_info'}->{'input_directory'}

package Texinfo::Parser;

# We need the unicode stuff.
use 5.006;
use strict;

# stop \s from matching non-ASCII spaces, etc.  \p{...} can still be
# used to match Unicode character classes.
use if $] >= 5.014, re => '/a';

# check that autovivification does not happen incorrectly.
#no autovivification qw(fetch delete exists store strict);

# debug
use Carp qw(cluck confess);

#use Data::Dumper;

# to detect if an encoding may be used to open the files
# to encode/decode in-memory strings used as files
use Encode qw(find_encoding decode encode);

# for fileparse
use File::Basename;

# Clone could be faster for small structures, which should be the case
# here, but Clone is not in Perl core modules, so we use Storable::dclone.
use Storable qw(dclone); # standard in 5.007003

# commands definitions
use Texinfo::Commands;
use Texinfo::Common;
# Error reporting and counting
use Texinfo::Report;

# for tree copy
use Texinfo::ManipulateTree;

# To register the parsed manual and associated information
# and also to call set_labels_identifiers_target.
use Texinfo::Document;

# in error messages, and for macro body expansion
use Texinfo::Convert::Texinfo;

# to normalize names
use Texinfo::Convert::NodeNameNormalization;

# to complete indices translations.
use Texinfo::Translations;

require Exporter;

# Set to 1 once the overrides below have been registered, such that
# they are registered only once even if the module is imported
# several times.
our $module_loaded = 0;

# Called when the package is imported.  On the first call only, each
# pair listed in @xs_overrides is passed to Texinfo::XSLoader::override,
# first the name in Texinfo::Parser, then the name in Texinfo::MiscXS
# (presumably to substitute the XS implementation for the Perl one;
# the override code is not in this file).  The call is then handed
# over to Exporter::import with the original arguments.
sub import {
  unless ($module_loaded) {
    my @xs_overrides = (
      ["Texinfo::Parser::_parse_texi_regex",
       "Texinfo::MiscXS::parse_texi_regex"],
      ["Texinfo::Parser::_parse_command_name",
       "Texinfo::MiscXS::parse_command_name"],
    );
    foreach my $xs_override (@xs_overrides) {
      my ($perl_function_name, $xs_function_name) = @$xs_override;
      Texinfo::XSLoader::override($perl_function_name, $xs_function_name);
    }
    $module_loaded = 1;
  }
  # The usual import method.  @_ has not been modified above, it is
  # seen unchanged by Exporter::import.
  goto &Exporter::import;
}

our $VERSION = '7.2';

# Document information set in the parser.
# The initialization is done by
# Texinfo::Document::new_document and afterwards the Texinfo::Document
# document is available in the 'document' key in the parser and
# document hash keys are directly accessed in the parser for efficiency
#'commands_info' => {},       # keys are @-commands names (without @) and
#                             # values are arrays for global multiple
#                             # @-commands and a value for non multiple
#                             # global @-commands.
#'listoffloats_list' => {},   # key is the normalized float type, value is
#                             # an array reference holding all the floats
#                             # of that type.
#'identifiers_target' => {},  # keys are normalized label names, as described
#                             # in the `HTML Xref' node. Value should be
#                             # a node/anchor or float in the tree.
#'internal_references' => [], # list of elements source of cross-references,
#                             # commands like @ref without books or external
#                             # manual files, and menu entries without
#                             # external manual.
#'labels_list' => [],         # array of elements associated with labels.
# information on document
#'global_info' => {'input_encoding_name' => 'utf-8',
#                  'included_files' => [],},
# indices            a structure holding the link between index
#                    names and merged indices;
#                    initial value is %index_names in Texinfo::Commands.

# these are the default values for the parsing state of a document.
# Some could become configurable if moved to Texinfo::Common
# %parser_document_state_configuration,
# but they are not configurable/implemented in the XS parser, so they are
# best left internal.  Could be relevant to reuse for diverse sources
# of input associated to the same document.
my %parser_document_state_initialization = (
  # parsed document parsing information still relevant after parsing

  # key is a command name value is the alias
  aliases => {},
  # the key is the user-defined macro name.  The value is the reference
  # on a macro element as obtained by parsing the @macro
  macros => {},
  # key is the command name, value is an array reference with 2 values,
  # beginning and ending.
  definfoenclose => {},

  # parsing information still relevant at the end of the parsing

  clickstyle => 'arrow',
  kbdinputstyle => 'distinct',
  source_mark_counters => {},
  # last seen node.
  current_node => undef,
  # last seen section.
  current_section => undef,
  # last seen part.
  current_part => undef,
  # the element associated with the last internal spaces element added.
  # We know that there can only be one at a time as a non space
  # character should always lead to abort_empty_line or another
  # function being called and the internal space element being
  # removed or put in the internal_space_holder info.
  # NOTE internal_space_holder is already unset in abort_empty_line
  # if the internal space element is put in the internal_space_holder.
  # It would be cleaner to unset internal_space_holder in all the
  # cases where the internal space element is removed too, such that
  # when internal_space_holder is set the previous value is undef and not
  # the previous internal_space_holder, which is now irrelevant as
  # its associated space has disappeared.
  internal_space_holder => undef,
  # modified by raise/lowersections
  sections_level_modifier => 0,
  # encoding name used for the input file
  input_file_encoding => 'utf-8',
);

my %parsing_state_initialization = (
  # parsing information only relevant during an input source parsing

  # a stack, with last at bottom.  Holds the opened files or text.
  # Pending macro expansion or text expansion is also in that structure.
  input => [],
  # a stack of conditional commands that are expanded.
  conditional_stack => [],
  # a stack of *macro block commands that are nested.
  macro_block_stack => [],
  # number of macros being expanded
  macro_expansion_nr => 0,
  # number of values being expanded
  value_expansion_nr => 0,
  # key is the context name, value is the depth of the context.
  nesting_context => {
    basic_inline_stack => [],
    basic_inline_stack_on_line => [],
    basic_inline_stack_block => [],
    regions_stack => [],
    footnote => 0,
    caption => 0,
  },
  # stack of the contexts, more recent on top.
  # 'ct_line' is added when on a line or
  #  block @-command line,
  # 'ct_def' is added instead if on a definition line.
  # 'ct_preformatted' is added in block commands
  #  where there is no paragraphs and spaces are kept
  #  (format, example, display and menu commands...)
  # 'ct_math' is added in math block commands
  #  (displaymath) and @math brace commands
  # 'ct_rawpreformatted' is added in raw block commands
  #  (html, xml, docbook...)
  # 'ct_inlineraw' is added when in inlineraw
  # 'ct_base' is (re-)added when in footnote,
  #  caption, or shortcaption (context brace_commands
  #  that does not already start another context, ie not
  #  math).
  # 'ct_paragraph' is added in paragraph.
  context_stack => [],
  # the stack of @-commands.  An @-command name can be added each time
  # a context is pushed on 'context_stack'.  Could be undef if there
  # is no @-command associated with the context.
  context_command_stack => [],
);

# Union of the two hashes above (the keys of the two hashes are distinct).
my %parser_state_initialization = (
  %parser_document_state_initialization,
  %parsing_state_initialization,
);

# other possible keys for the parser state initialized based
# on customization variables:

# parsing information still relevant at the end of the parsing
#   line_commands         the same as %line_commands, but with index entry
#                         commands dynamically added.
#   brace_commands        the same as %brace_commands, but with definfoenclose
#                         commands dynamically added.
#   valid_nestings        direct command valid nesting information, with
#                         index entry commands dynamically added.
#   no_paragraph_commands the same as %no_paragraph_commands,
#                         with new index entry commands dynamically added.
#   basic_inline_commands the same as %contain_basic_inline_commands below, but
#                         with new index entry commands dynamically added
#   command_index         associate a command name with an index name.
#   index_entry_commands  index entry commands, including added index commands.

# parser keys related to customization
#   expanded_formats_hash each key comes from EXPANDED_FORMATS, value is 1
#   set                   points to the value set when initializing, for
#                         configuration items that are not to be overridden
#                         by @-commands.  For example documentlanguage.
#   conf                  Customization and document state configuration
#                         based on defaults and parser argument.

# other keys for the parser state initialized at parser creation
#   registrar             Texinfo::Report object used for error
#                         reporting.

# A source information is a hash reference with the keys:
#   line_nr               the line number.
#   file_name             the file name, a binary string.
#   macro                 if in a macro expansion, the name of the macro.

# The input structure is an array, the first is the most recently included
# file.  The last element may correspond to a file if the parsing is done
# on a file, with parse_texi_file, or hold pending text, if called on text.
# each element of the array is a hash reference.
#
# The keys are:
# for both text and file:
#   source_info           source information corresponding to the current file.
#   input_source_mark     source mark associated with the input (include file,
#                         macro or value expansion).
#
# for text:
#   th                    handle for text given in input or expansion text
#                         of value or macro.
#   value_flag            set if the text corresponds to a @value command
#                         expansion.
#   macro_name            set if the text corresponds to a new macro expansion.
#
# for a file:
#   fh                    filehandle for the file.
#   input_file_path       file path.

# The commands in initialization_overrides are not set in the document if
# set at the parser initialization.
my %initialization_overrides = (
  'documentlanguage' => 1,
);

# File-private copies of the command tables defined in Texinfo::Commands
# and Texinfo::Common.
my %nobrace_commands = %Texinfo::Commands::nobrace_commands;
my %line_commands = %Texinfo::Commands::line_commands;
my %brace_commands = %Texinfo::Commands::brace_commands;
my %commands_args_number = %Texinfo::Commands::commands_args_number;
my %accent_commands = %Texinfo::Commands::accent_commands;
my %contain_plain_text_commands
  = %Texinfo::Commands::contain_plain_text_commands;
my %contain_basic_inline_commands
  = %Texinfo::Commands::contain_basic_inline_commands;
my %block_commands = %Texinfo::Commands::block_commands;
my %blockitem_commands = %Texinfo::Commands::blockitem_commands;
my %close_paragraph_commands = %Texinfo::Commands::close_paragraph_commands;
my %def_commands = %Texinfo::Commands::def_commands;
my %def_alias_commands = %Texinfo::Commands::def_alias_commands;
my %preformatted_commands = %Texinfo::Commands::preformatted_commands;
my %math_commands = %Texinfo::Commands::math_commands;
my %deprecated_commands = %Texinfo::Commands::deprecated_commands;
my %root_commands = %Texinfo::Commands::root_commands;
my %sectioning_heading_commands
  = %Texinfo::Commands::sectioning_heading_commands;
my %ref_commands = %Texinfo::Commands::ref_commands;
my %heading_spec_commands = %Texinfo::Commands::heading_spec_commands;
my %in_heading_spec_commands = %Texinfo::Commands::in_heading_spec_commands;
my %variadic_commands = %Texinfo::Commands::variadic_commands;
my %default_index_commands = %Texinfo::Commands::default_index_commands;
# NOTE the local name differs from the name in Texinfo::Commands
my %global_multiple_commands = %Texinfo::Commands::global_commands;
my %global_unique_commands = %Texinfo::Commands::global_unique_commands;
my %in_index_commands = %Texinfo::Commands::in_index_commands;
my %explained_commands = %Texinfo::Commands::explained_commands;
my %inline_format_commands = %Texinfo::Commands::inline_format_commands;
my %index_entry_command_commands
  = %Texinfo::Commands::index_entry_command_commands;
my %def_map = %Texinfo::Common::def_map;
my %def_aliases = %Texinfo::Common::def_aliases;
my %all_commands = %Texinfo::Common::all_commands;
my %encoding_name_conversion_map
  = %Texinfo::Common::encoding_name_conversion_map;

# Keys are commands, values are names of indices.  User-defined
# index commands are added dynamically.
# The default index commands come last in the list, such that they
# take precedence over the two table commands set first.
my %command_index = (
  'vtable' => 'vr',
  'ftable' => 'fn',
  %default_index_commands,
);

# the type of index, fn: function, vr: variable, tp: type
my %index_type_def = (
  'fn' => ['deffn', 'deftypefn', 'deftypeop', 'defop'],
  'vr' => ['defvr', 'deftypevr', 'defcv', 'deftypecv' ],
  'tp' => ['deftp']
);

# associate each definition command listed above with its index
foreach my $index_name (keys(%index_type_def)) {
  my @index_def_commands = @{$index_type_def{$index_name}};
  @command_index{@index_def_commands}
    = ($index_name) x scalar(@index_def_commands);
}

# When the %def_map value is a hash reference, the definition command
# gets the index of the command found as key of that hash.  In all the
# cases the 'x' variant of the definition command gets the same index
# as the definition command itself (possibly undef).
foreach my $def_command (keys(%def_map)) {
  my $def_specification = $def_map{$def_command};
  if (ref($def_specification) eq 'HASH') {
    my ($aliased_command) = keys(%$def_specification);
    $command_index{$def_command} = $command_index{$aliased_command};
  }
  $command_index{$def_command.'x'} = $command_index{$def_command};
}

# equivalence between a @set flag and an @-command
my %set_flag_command_equivalent = (
  'txicodequoteundirected' => 'codequoteundirected',
  'txicodequotebacktick' => 'codequotebacktick',
#  'txideftypefnnl' => 'deftypefnnewline',
);

# could be moved to Texinfo::Common if needed more generally
# same order as in XS parser
# Each entry holds a @set flag name followed by a character.
my @set_flag_index_char_ignore = (
  ['txiindexbackslashignore', '\\'],
  ['txiindexhyphenignore', '-'],
  ['txiindexlessthanignore', '<'],
  ['txiindexatsignignore', '@'],
);

# after checking that the context is in begin_paragraph_contexts, the list
# of types in which paragraphs are not started.
my %type_without_paragraph
  = map { ($_ => 1) } ('brace_arg', 'brace_container');

# To keep in sync with XS main/element_types.txt leading_space flag
my %leading_space_types = map { ($_ => 1) } (
  'empty_line',
  'ignorable_spaces_after_command',
  'internal_spaces_after_command',
  'internal_spaces_before_argument',
  'internal_spaces_before_context_argument',
  'spaces_after_close_brace',
);

my %command_ignore_space_after = map { ($_ => 1) } (
  'anchor', 'hyphenation', 'caption', 'shortcaption',
  'sortas', 'seeentry', 'seealso',
);

# @-commands that should be at the beginning of a line
# 'node' and 'end' are first associated with their own name, then all
# the line commands are associated with 1, which takes precedence for
# the commands also in %line_commands.
my %begin_line_commands = (
  (map { ($_ => $_) } ('node', 'end')),
  (map { ($_ => 1) } keys(%line_commands)),
);
# line commands that need not be at the beginning of a line
delete @begin_line_commands{'comment', 'c', 'columnfractions',
                            'item', 'subentry'};
# all the block commands should be at the beginning of a line
$begin_line_commands{$_} = 1 foreach (keys(%block_commands));

# default indices
my %index_names = %Texinfo::Commands::index_names;

# @-commands that do not start a paragraph
my %no_paragraph_commands = %Texinfo::Commands::no_paragraph_commands;

# does not include index commands
# All the commands closing paragraphs, except for the commands
# in %close_paragraph_not_preformatted.
my %close_preformatted_commands = %close_paragraph_commands;
my %close_paragraph_not_preformatted = ('sp' => 1);
delete @close_preformatted_commands{keys(%close_paragraph_not_preformatted)};

# nobrace commands of 'symbol' type that are not in heading specification
# commands
my %symbol_nobrace_commands
  = map { ($_ => 1) }
    grep { $nobrace_commands{$_} eq 'symbol'
           and !$in_heading_spec_commands{$_} }
      keys(%nobrace_commands);

# commands that may appear in commands containing plain text only
# These are the accent commands, the brace commands without argument,
# the symbol nobrace commands selected above and the comment commands.
my %in_plain_text_commands = %accent_commands;
$in_plain_text_commands{$_} = 1
  foreach ((grep { $brace_commands{$_} eq 'noarg' } keys(%brace_commands)),
           keys(%symbol_nobrace_commands),
           'c', 'comment');

# commands that may appear in any text argument, similar constraints
# as in paragraphs.
my %in_full_text_commands = map { ($_ => 1) } (
  # start from all the brace commands
  keys(%brace_commands),
  keys(%symbol_nobrace_commands),
  # selected line and nobrace commands
  'c', 'comment', 'refill', 'subentry', 'columnfractions', 'set',
  'clear', 'end',
  # selected block commands
  (grep { $block_commands{$_} eq 'conditional'
          or $block_commands{$_} eq 'format_raw' } keys(%block_commands)),
);

# sort out brace commands and setup command list appearing in more
# restricted context.

# those two commands are not allowed in any command except for @float
delete @in_full_text_commands{'caption', 'shortcaption'};

# commands that accept full text, but no block or top-level commands
# Style brace commands not already in %contain_plain_text_commands
# and selected line commands.
my %contain_full_text_commands
  = map { ($_ => 1) }
    grep {
      my $brace_command_type = $brace_commands{$_};
      !exists($contain_plain_text_commands{$_})
        and ($brace_command_type eq 'style_code'
             or $brace_command_type eq 'style_other'
             or $brace_command_type eq 'style_no_code')
    } keys(%brace_commands);
$contain_full_text_commands{$_} = 1
  foreach ('center', 'exdent', 'item', 'itemx', 'nodedescription');

# Fill the valid nestings hash.  The keys are the containing commands and
# the values arrays of commands that are allowed to occur inside those
# commands.
# All commands not in this hash are considered to accept anything.
# There are additional context tests, to make sure, for instance that we are
# testing @-commands on the block, line or node @-command line and not
# in the content.
# The values are references on shared hashes.  The commands containing
# full text are set last and take precedence for a command in both lists.
my %default_valid_nestings = (
  (map { ($_ => \%in_plain_text_commands) }
     keys(%contain_plain_text_commands)),
  (map { ($_ => \%in_full_text_commands) }
     keys(%contain_full_text_commands)),
);

# @this* commands should not appear in any line command except for
# page heading specification commands and can also appear in brace @-commands,
# on heading specification commands lines, such as indicatric @-commands.
foreach my $brace_command (keys(%brace_commands)) {
  my $brace_command_type = $brace_commands{$brace_command};
  next unless ($brace_command_type eq 'style_code'
               or $brace_command_type eq 'style_other'
               or $brace_command_type eq 'style_no_code');
  # a new hash is built from the previous value to avoid modifying the
  # shared structure, with the heading specification commands added
  $default_valid_nestings{$brace_command} = {
    %{$default_valid_nestings{$brace_command}},
    (map { ($_ => 1) } keys(%in_heading_spec_commands)),
  };
}

# For _check_valid_nesting_context
my %in_basic_inline_commands = %in_full_text_commands;
delete @in_basic_inline_commands{'xref', 'ref', 'pxref', 'inforef',
                                 'titlefont', 'anchor', 'footnote', 'verb'};
$in_basic_inline_commands{$_} = 1 foreach (keys(%in_heading_spec_commands));

my %contain_basic_inline_with_refs_commands = (
  %sectioning_heading_commands,
  %def_commands,
);

my %ok_in_basic_inline_with_refs_commands
  = map { ($_ => 1) } ('xref', 'ref', 'pxref', 'inforef');

# the block commands of 'region' type
my %not_in_region_commands
  = map { ($_ => 1) }
    grep { $block_commands{$_} eq 'region' } keys(%block_commands);

# index names that cannot be set by the user.
# The default index names are forbidden, together with the first
# character of the names matching the pattern below, and a list of
# other names.
my %forbidden_index_name = ();
foreach my $default_index_name (keys(%index_names)) {
  $forbidden_index_name{$default_index_name} = 1;
  $forbidden_index_name{$1} = 1 if ($default_index_name =~ /^(.).$/);
}
$forbidden_index_name{$_} = 1
  foreach ('info','ps','pdf','htm', 'html',
           'log','aux','dvi','texi','txi','texinfo','tex','bib');

# Valid encodings as described in the Texinfo manual
my %canonical_texinfo_encodings = map { ($_ => 1) } (
  'us-ascii', 'utf-8', 'iso-8859-1', 'iso-8859-15', 'iso-8859-2',
  'koi8-r', 'koi8-u',
);

# the keys are the context names listed, prefixed by 'ct_'
my %begin_paragraph_contexts = map { ('ct_'.$_ => 1) } ('base');