# ParserNonXS.pm: parse texinfo code into a tree.
#
# Copyright 2010-2024 Free Software Foundation, Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License,
# or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# Original author: Patrice Dumas
# Parts (also from Patrice Dumas) come from texi2html.pl or texi2html.init.
# Since there are different parser implementation, XS and NonXS, it is
# better to have the Texinfo::Parser packages define only the parser
# API functions. Constants, functions useful in both parsers, and other
# functions useful in other codes are better defined in other Texinfo
# modules.
# The organization of the file is the following:
# default parser state. With explanation of the internal structures.
# determination of command types.
# user visible subroutines and subroutines related to input.
# internal subroutines, doing the parsing.
# In general, the Parser works with character strings decoded from the
# command line, from input files or from the parsed document. There are
# exceptions for the following files and directory names that are binary
# strings:
# * the input file name passed through parse_texi_file is a binary string
# * @include file name and CPP line directive file names are encoded
# into binary strings.
# Those binary strings are in 'file_name' keys, they transit through
# $self->{'input'} and end up in 'source_info' in tree elements and
# in error messages.
#
# The following parser information is directly determined from the
# input file name as binary strings
# ->{'global_info'}->{'input_file_name'}
# ->{'global_info'}->{'input_directory'}
package Texinfo::Parser;
# We need the unicode stuff.
use 5.006;
use strict;
# stop \s from matching non-ASCII spaces, etc. \p{...} can still be
# used to match Unicode character classes.
use if $] >= 5.014, re => '/a';
# check that autovivification does not happen incorrectly.
#no autovivification qw(fetch delete exists store strict);
# debug
use Carp qw(cluck confess);
#use Data::Dumper;
# to detect if an encoding may be used to open the files
# to encode/decode in-memory strings used as files
use Encode qw(find_encoding decode encode);
# for fileparse
use File::Basename;
# Clone could be faster for small structures, which should be the case
# here, but Clone is not in Perl core modules, so we use Storable::dclone.
use Storable qw(dclone); # standard in 5.007003
# commands definitions
use Texinfo::Commands;
use Texinfo::Common;
# Error reporting and counting
use Texinfo::Report;
# for tree copy
use Texinfo::ManipulateTree;
# To register the parsed manual and associated information
# and also to call set_labels_identifiers_target.
use Texinfo::Document;
# in error messages, and for macro body expansion
use Texinfo::Convert::Texinfo;
# to normalize names
use Texinfo::Convert::NodeNameNormalization;
# to complete indices translations.
use Texinfo::Translations;
require Exporter;
# Becomes true once the overrides below have been registered, such that
# importing the module several times registers them only once.
our $module_loaded = 0;

# Custom import.  On the first call, each listed Texinfo::Parser subroutine
# name is passed to Texinfo::XSLoader::override together with the name of
# its Texinfo::MiscXS counterpart.  Control is then handed over to the
# regular Exporter import.
sub import {
  unless ($module_loaded) {
    # [Texinfo::Parser subroutine, Texinfo::MiscXS subroutine] pairs, in
    # registration order.
    my @overrides = (
      ["Texinfo::Parser::_parse_texi_regex",
       "Texinfo::MiscXS::parse_texi_regex"],
      ["Texinfo::Parser::_parse_command_name",
       "Texinfo::MiscXS::parse_command_name"],
    );
    foreach my $override (@overrides) {
      my ($parser_sub, $xs_sub) = @$override;
      Texinfo::XSLoader::override($parser_sub, $xs_sub);
    }
    $module_loaded = 1;
  }
  # The usual import method.  goto &sub passes the current @_ along.
  goto &Exporter::import;
}

our $VERSION = '7.2';
# Document information set in the parser. The initialization is done by
# Texinfo::Document::new_document and afterwards the Texinfo::Document
# document is available in the 'document' key in the parser and
# document hash keys are directly accessed in the parser for efficiency
#'commands_info' => {}, # keys are @-commands names (without @) and
# values are arrays for global multiple
# @-commands and a value for non multiple
# global @-commands.
#'listoffloats_list' => {}, # key is the normalized float type, value is
# an array reference holding all the floats
# of that type.
#'identifiers_target' => {}, # keys are normalized label names, as described
# in the `HTML Xref' node. Value should be
# a node/anchor or float in the tree.
#'internal_references' => [], # list of elements source of cross-references,
# commands like @ref without books or external
# manual files, and menu entries without
# external manual.
#'labels_list' => [], # array of elements associated with labels.
# information on document
#'global_info' => {'input_encoding_name' => 'utf-8',
# 'included_files' => [],},
# indices a structure holding the link between index
# names and merged indices;
# initial value is %index_names in Texinfo::Commands.
# these are the default values for the parsing state of a document.
# Some could become configurable if moved to Texinfo::Common
# %parser_document_state_configuration,
# but they are not configurable/implemented in the XS parser, so they are
# best left internal. Could be relevant to reuse for diverse sources
# of input associated to the same document.
my %parser_document_state_initialization = (
  # --- parsed document parsing information still relevant after parsing

  # key is a command name value is the alias
  'aliases' => {},
  # the key is the user-defined macro name.  The value is the reference
  # on a macro element as obtained by parsing the @macro
  'macros' => {},
  # key is the command name, value is an array reference with 2 values,
  # beginning and ending.
  'definfoenclose' => {},

  # --- parsing information still relevant at the end of the parsing

  'clickstyle' => 'arrow',
  'kbdinputstyle' => 'distinct',
  'source_mark_counters' => {},
  # last seen node.
  'current_node' => undef,
  # last seen section.
  'current_section' => undef,
  # last seen part.
  'current_part' => undef,
  # the element associated with the last internal spaces element added.
  # We know that there can only be one at a time as a non space
  # character should always lead to abort_empty_line or another
  # function being called and the internal space element being
  # removed or put in the internal_space_holder info.
  # NOTE internal_space_holder is already unset in abort_empty_line
  # if the internal space element is put in the internal_space_holder.
  # It would be cleaner to unset internal_space_holder in all the
  # cases where the internal space element is removed too, such that
  # when internal_space_holder is set the previous value is undef and not
  # the previous internal_space_holder, which is now irrelevant as
  # its associated space has disappeared.
  'internal_space_holder' => undef,
  # modified by raise/lowersections
  'sections_level_modifier' => 0,
  # encoding name used for the input file
  'input_file_encoding' => 'utf-8',
);
# parsing information only relevant during an input source parsing
my %parsing_state_initialization = (
  # a stack, with last at bottom.  Holds the opened files or text.
  # Pending macro expansion or text expansion is also in that structure.
  'input' => [],
  # a stack of conditional commands that are expanded.
  'conditional_stack' => [],
  # a stack of *macro block commands that are nested.
  'macro_block_stack' => [],
  # number of macros being expanded
  'macro_expansion_nr' => 0,
  # number of values being expanded
  'value_expansion_nr' => 0,
  # key is the context name, value is the depth of the context.
  'nesting_context' => {
    'basic_inline_stack' => [],
    'basic_inline_stack_on_line' => [],
    'basic_inline_stack_block' => [],
    'regions_stack' => [],
    'footnote' => 0,
    'caption' => 0,
  },
  # stack of the contexts, more recent on top.
  #  'ct_line' is added when on a line or block @-command line,
  #  'ct_def' is added instead if on a definition line.
  #  'ct_preformatted' is added in block commands where there is no
  #     paragraphs and spaces are kept (format, example, display and
  #     menu commands...)
  #  'ct_math' is added in math block commands (displaymath) and
  #     @math brace commands
  #  'ct_rawpreformatted' is added in raw block commands
  #     (html, xml, docbook...)
  #  'ct_inlineraw' is added when in inlineraw
  #  'ct_base' is (re-)added when in footnote, caption, or shortcaption
  #     (context brace_commands that does not already start another
  #     context, ie not math).
  #  'ct_paragraph' is added in paragraph.
  'context_stack' => [],
  # the stack of @-commands.  An @-command name can be added each time
  # a context is pushed on 'context_stack'.  Could be undef if there
  # is no @-command associated with the context.
  'context_command_stack' => [],
);
# Complete initial parser state: the document state keys followed by the
# parsing state keys.  The values are copied as is, references included.
my %parser_state_initialization;
foreach my $state_initialization (\%parser_document_state_initialization,
                                  \%parsing_state_initialization) {
  @parser_state_initialization{keys(%$state_initialization)}
    = values(%$state_initialization);
}
# other possible keys for the parser state initialized based
# on customization variables:
# parsing information still relevant at the end of the parsing
# line_commands the same as %line_commands, but with index entry
# commands dynamically added.
# brace_commands the same as %brace_commands, but with definfoenclose
# commands dynamically added.
# valid_nestings direct command valid nesting information, with
# index entry commands dynamically added.
# no_paragraph_commands the same as %no_paragraph_commands,
# with new index entry commands dynamically added.
# basic_inline_commands the same as %contain_basic_inline_commands below, but
# with new index entry commands dynamically added
# command_index associate a command name with an index name.
# index_entry_commands index entry commands, including added index commands.
# parser keys related to customization
# expanded_formats_hash each key comes from EXPANDED_FORMATS, value is 1
# set points to the value set when initializing, for
# configuration items that are not to be overridden
# by @-commands. For example documentlanguage.
# conf Customization and document state configuration
# based on defaults and parser argument.
# other keys for the parser state initialized at parser creation
# registrar # Texinfo::Report object used for error
# # reporting.
# A source information is a hash reference with the keys:
# line_nr the line number.
# file_name the file name, a binary string.
# macro if in a macro expansion, the name of the macro.
# The input structure is an array, the first is the most recently included
# file. The last element may correspond to a file if the parsing is done
# on a file, with parse_texi_file, or hold pending text, if called on text.
# each element of the array is a hash reference.
#
# The keys are:
# for both text and file:
# source_info source information corresponding to the current file.
# input_source_mark source mark associated with the input (include file,
# macro or value expansion).
#
# for text:
# th handle for text given in input or expansion text
# of value or macro.
# value_flag set if the text corresponds to a @value command
# expansion.
# macro_name set if the text corresponds to a new macro expansion.
#
# for a file:
# fh filehandle for the file.
# input_file_path file path.
# The commands in initialization_overrides are not set in the document if
# set at the parser initialization.
my %initialization_overrides
  = map { $_ => 1 } ('documentlanguage');
# Private copies of the Texinfo::Commands tables used in this file.

# main command categories
my %brace_commands = %Texinfo::Commands::brace_commands;
my %block_commands = %Texinfo::Commands::block_commands;
my %line_commands = %Texinfo::Commands::line_commands;
my %nobrace_commands = %Texinfo::Commands::nobrace_commands;
my %root_commands = %Texinfo::Commands::root_commands;
my %commands_args_number = %Texinfo::Commands::commands_args_number;

# brace commands subsets
my %accent_commands = %Texinfo::Commands::accent_commands;
my %explained_commands = %Texinfo::Commands::explained_commands;
my %inline_format_commands = %Texinfo::Commands::inline_format_commands;
my %ref_commands = %Texinfo::Commands::ref_commands;

# block and definition commands subsets
my %blockitem_commands = %Texinfo::Commands::blockitem_commands;
my %def_commands = %Texinfo::Commands::def_commands;
my %def_alias_commands = %Texinfo::Commands::def_alias_commands;
my %math_commands = %Texinfo::Commands::math_commands;
my %preformatted_commands = %Texinfo::Commands::preformatted_commands;

# commands content and paragraph handling
my %close_paragraph_commands = %Texinfo::Commands::close_paragraph_commands;
my %contain_basic_inline_commands
  = %Texinfo::Commands::contain_basic_inline_commands;
my %contain_plain_text_commands
  = %Texinfo::Commands::contain_plain_text_commands;

# headings
my %heading_spec_commands = %Texinfo::Commands::heading_spec_commands;
my %in_heading_spec_commands = %Texinfo::Commands::in_heading_spec_commands;
my %sectioning_heading_commands
  = %Texinfo::Commands::sectioning_heading_commands;

# indices
my %default_index_commands = %Texinfo::Commands::default_index_commands;
my %in_index_commands = %Texinfo::Commands::in_index_commands;
my %index_entry_command_commands
  = %Texinfo::Commands::index_entry_command_commands;

# global commands.  Note that %global_multiple_commands is the copy
# of %Texinfo::Commands::global_commands.
my %global_multiple_commands = %Texinfo::Commands::global_commands;
my %global_unique_commands = %Texinfo::Commands::global_unique_commands;

# other
my %deprecated_commands = %Texinfo::Commands::deprecated_commands;
my %variadic_commands = %Texinfo::Commands::variadic_commands;

# Private copies of the Texinfo::Common tables used in this file.
my %all_commands = %Texinfo::Common::all_commands;
my %def_aliases = %Texinfo::Common::def_aliases;
my %def_map = %Texinfo::Common::def_map;
my %encoding_name_conversion_map
  = %Texinfo::Common::encoding_name_conversion_map;
# Keys are commands, values are names of indices.  User-defined
# index commands are added dynamically.
# The default index commands are listed last, such that their index name
# is the one kept if a command is set twice.
my %command_index = (
  'vtable' => 'vr',
  'ftable' => 'fn',
  %default_index_commands,
);

# the type of index, fn: function, vr: variable, tp: type
my %index_type_def = (
  'fn' => ['deffn', 'deftypefn', 'deftypeop', 'defop'],
  'vr' => ['defvr', 'deftypevr', 'defcv', 'deftypecv'],
  'tp' => ['deftp'],
);

# associate each of the definition commands listed above with the index
# of its type.
foreach my $index_type (keys(%index_type_def)) {
  my @type_def_commands = @{$index_type_def{$index_type}};
  @command_index{@type_def_commands}
    = ($index_type) x scalar(@type_def_commands);
}

# A %def_map entry whose value is a hash reference gets the index of the
# command that is the first key of that hash.  The index is then also
# associated with the name of the command with a trailing 'x'.
foreach my $def_command (keys(%def_map)) {
  my $definition = $def_map{$def_command};
  if (ref($definition) eq 'HASH') {
    my ($real_command) = keys(%$definition);
    $command_index{$def_command} = $command_index{$real_command};
  }
  my $index_name = $command_index{$def_command};
  $command_index{$def_command.'x'} = $index_name;
}
# equivalence between a @set flag and an @-command.  The key is the flag
# name, the value is the @-command name.
my %set_flag_command_equivalent;
$set_flag_command_equivalent{'txicodequoteundirected'} = 'codequoteundirected';
$set_flag_command_equivalent{'txicodequotebacktick'} = 'codequotebacktick';
# not set:
#$set_flag_command_equivalent{'txideftypefnnl'} = 'deftypefnnewline';
# List of [@set flag name, character] pairs.
# could be moved to Texinfo::Common if needed more generally
# same order as in XS parser
my @set_flag_index_char_ignore;
push @set_flag_index_char_ignore,
  ['txiindexbackslashignore', '\\'],
  ['txiindexhyphenignore', '-'],
  ['txiindexlessthanignore', '<'],
  ['txiindexatsignignore', '@'];
# after checking that the context is in begin_paragraph_contexts, the list
# of types in which paragraphs are not started.
my %type_without_paragraph
  = map { $_ => 1 } ('brace_arg', 'brace_container');
# To keep in sync with XS main/element_types.txt leading_space flag
my @leading_space_type_names = (
  'empty_line',
  'ignorable_spaces_after_command',
  'internal_spaces_after_command',
  'internal_spaces_before_argument',
  'internal_spaces_before_context_argument',
  'spaces_after_close_brace',
);
my %leading_space_types;
@leading_space_types{@leading_space_type_names}
  = (1) x scalar(@leading_space_type_names);
# Set of command names, each associated with 1.
my %command_ignore_space_after = map { $_ => 1 } (
  'anchor', 'hyphenation',
  'caption', 'shortcaption',
  'sortas', 'seeentry', 'seealso',
);
# @-commands that should be at the beginning of a line
# NOTE(review): for 'node' and 'end' the value set here is the command name
# and not 1 as for the other commands; presumably only the truth of the
# value matters -- to be confirmed against the code using this hash.
my %begin_line_commands = map { $_ => $_ } ('node', 'end');

# all the line commands are associated with 1...
$begin_line_commands{$_} = 1 foreach (keys(%line_commands));

# ... except for the following commands that are removed.
delete @begin_line_commands{'comment', 'c', 'columnfractions',
                            'item', 'subentry'};
# default indices
my %index_names = %Texinfo::Commands::index_names;

# @-commands that do not start a paragraph
my %no_paragraph_commands = %Texinfo::Commands::no_paragraph_commands;

# does not include index commands
# Same as %close_paragraph_commands, without the commands in
# %close_paragraph_not_preformatted.
my %close_paragraph_not_preformatted = ('sp' => 1);
my %close_preformatted_commands = %close_paragraph_commands;
delete @close_preformatted_commands{keys(%close_paragraph_not_preformatted)};

# every block command is added to %begin_line_commands
$begin_line_commands{$_} = 1 foreach (keys(%block_commands));
# commands that may appear in commands containing plain text only.
# Start from the accent commands and the comment commands.
my %in_plain_text_commands = (
  %accent_commands,
  'c' => 1,
  'comment' => 1,
);

# add the brace commands of type 'noarg'
foreach my $brace_command (grep {$brace_commands{$_} eq 'noarg'}
                                               keys(%brace_commands)) {
  $in_plain_text_commands{$brace_command} = 1;
}

# nobrace commands of type 'symbol' that are not in
# %in_heading_spec_commands.  They are also added to the commands that
# may appear in plain text.
my %symbol_nobrace_commands;
foreach my $no_brace_command (keys(%nobrace_commands)) {
  next if ($nobrace_commands{$no_brace_command} ne 'symbol');
  next if ($in_heading_spec_commands{$no_brace_command});

  $symbol_nobrace_commands{$no_brace_command} = 1;
  $in_plain_text_commands{$no_brace_command} = 1;
}
# commands that may appear in any text argument, similar constraints
# as in paragraphs.
my %in_full_text_commands = map { $_ => 1 } (
  # all the brace commands and the symbol nobrace commands
  keys(%brace_commands), keys(%symbol_nobrace_commands),
  # selected line and nobrace commands
  'c', 'comment', 'refill', 'subentry',
  'columnfractions', 'set', 'clear', 'end',
);

# selected block commands: the conditional and format_raw block commands
foreach my $block_command (keys(%block_commands)) {
  my $block_type = $block_commands{$block_command};
  next unless ($block_type eq 'conditional' or $block_type eq 'format_raw');
  $in_full_text_commands{$block_command} = 1;
}

# sort out brace commands and setup command list appearing in more
# restricted context.

# those two commands are not allowed in any command except for @float
delete @in_full_text_commands{'caption', 'shortcaption'};
# commands that accept full text, but no block or top-level commands.
# Selected line commands to begin with.
my %contain_full_text_commands = map { $_ => 1 } (
  'center', 'exdent', 'item', 'itemx', 'nodedescription',
);

# Add the style brace commands that are not already registered as
# containing plain text only.
foreach my $brace_command (keys(%brace_commands)) {
  next if (exists($contain_plain_text_commands{$brace_command}));

  my $brace_type = $brace_commands{$brace_command};
  next unless (grep {$brace_type eq $_}
                    ('style_code', 'style_other', 'style_no_code'));
  $contain_full_text_commands{$brace_command} = 1;
}
# Fill the valid nestings hash.  The keys are the containing commands and
# the values are references on hashes of the commands that are allowed to
# occur inside those commands.  All commands not in this hash are
# considered to accept anything.
# There are additional context tests, to make sure, for instance that we are
# testing @-commands on the block, line or node @-command line and not
# in the content.
my %default_valid_nestings;
$default_valid_nestings{$_} = \%in_plain_text_commands
  foreach (keys(%contain_plain_text_commands));
$default_valid_nestings{$_} = \%in_full_text_commands
  foreach (keys(%contain_full_text_commands));

# @this* commands should not appear in any line command except for
# page heading specification commands.  They can also appear in brace
# @-commands on heading specification commands lines, such as the
# @-commands indicating what a piece of text is (style brace commands).
foreach my $brace_command (keys(%brace_commands)) {
  my $brace_type = $brace_commands{$brace_command};
  next unless ($brace_type eq 'style_code'
               or $brace_type eq 'style_other'
               or $brace_type eq 'style_no_code');

  # a new hash is built, to avoid modifying the shared structure
  $default_valid_nestings{$brace_command} = {
    %{$default_valid_nestings{$brace_command}},
    map { $_ => 1 } keys(%in_heading_spec_commands)
  };
}
# For _check_valid_nesting_context
# Same as %in_full_text_commands, with selected commands removed and
# with the commands in %in_heading_spec_commands added.
my %in_basic_inline_commands = %in_full_text_commands;
delete @in_basic_inline_commands{'xref', 'ref', 'pxref', 'inforef',
                                 'titlefont', 'anchor', 'footnote', 'verb'};
$in_basic_inline_commands{$_} = 1
  foreach (keys(%in_heading_spec_commands));
# the sectioning and heading commands, followed by the definition commands
my %contain_basic_inline_with_refs_commands;
foreach my $commands_hash (\%sectioning_heading_commands, \%def_commands) {
  @contain_basic_inline_with_refs_commands{keys(%$commands_hash)}
    = values(%$commands_hash);
}

# the cross-reference commands, each associated with 1
my %ok_in_basic_inline_with_refs_commands
  = map { $_ => 1 } ('xref', 'ref', 'pxref', 'inforef');
# the block commands of type 'region', each associated with 1
my %not_in_region_commands
  = map { $_ => 1 }
      grep {$block_commands{$_} eq 'region'} keys(%block_commands);
# index names that cannot be set by the user.
# other forbidden names first
my %forbidden_index_name = map { $_ => 1 } (
  'info', 'ps', 'pdf', 'htm', 'html', 'log', 'aux', 'dvi',
  'texi', 'txi', 'texinfo', 'tex', 'bib',
);

# then the default index names.  For a two characters name, the first
# character is forbidden too.
foreach my $name (keys(%index_names)) {
  $forbidden_index_name{$name} = 1;
  $forbidden_index_name{$1} = 1 if ($name =~ /^(.).$/);
}
# Valid encodings as described in the Texinfo manual
my %canonical_texinfo_encodings = map { $_ => 1 } (
  'us-ascii', 'utf-8',
  'iso-8859-1', 'iso-8859-15', 'iso-8859-2',
  'koi8-r', 'koi8-u',
);
# The keys are the listed context names prefixed by 'ct_'.
my %begin_paragraph_contexts;
$begin_paragraph_contexts{'ct_'.$_} = 1 foreach ('base');