Filename | /usr/share/perl/5.36/Pod/Simple/BlackBox.pm |
Statements | Executed 105 statements in 7.66ms |
Calls | P | F | Exclusive Time |
Inclusive Time |
Subroutine |
---|---|---|---|---|---|
9 | 9 | 2 | 622µs | 883µs | my_qr | Pod::Simple::BlackBox::
9 | 9 | 9 | 66µs | 66µs | CORE:match (opcode) | Pod::Simple::BlackBox::
1 | 1 | 1 | 11µs | 15µs | BEGIN@289 | Pod::Simple::BlackBox::
1 | 1 | 1 | 10µs | 13µs | BEGIN@1520 | Pod::Simple::BlackBox::
10 | 10 | 10 | 8µs | 8µs | CORE:qr (opcode) | Pod::Simple::BlackBox::
1 | 1 | 1 | 7µs | 8µs | BEGIN@21 | Pod::Simple::BlackBox::
1 | 1 | 1 | 5µs | 14µs | BEGIN@273 | Pod::Simple::BlackBox::
1 | 1 | 1 | 4µs | 16µs | BEGIN@67 | Pod::Simple::BlackBox::
1 | 1 | 1 | 3µs | 4µs | BEGIN@22 | Pod::Simple::BlackBox::
1 | 1 | 1 | 3µs | 19µs | BEGIN@24 | Pod::Simple::BlackBox::
1 | 1 | 1 | 3µs | 3µs | BEGIN@55 | Pod::Simple::BlackBox::
1 | 1 | 1 | 2µs | 2µs | BEGIN@23 | Pod::Simple::BlackBox::
0 | 0 | 0 | 0s | 0s | _closers_for_all_curr_open | Pod::Simple::BlackBox::
0 | 0 | 0 | 0s | 0s | _dump_curr_open | Pod::Simple::BlackBox::
0 | 0 | 0 | 0s | 0s | _gen_errata | Pod::Simple::BlackBox::
0 | 0 | 0 | 0s | 0s | _handle_encoding_line | Pod::Simple::BlackBox::
0 | 0 | 0 | 0s | 0s | _handle_encoding_second_level | Pod::Simple::BlackBox::
0 | 0 | 0 | 0s | 0s | _ponder_Data | Pod::Simple::BlackBox::
0 | 0 | 0 | 0s | 0s | _ponder_Plain | Pod::Simple::BlackBox::
0 | 0 | 0 | 0s | 0s | _ponder_Verbatim | Pod::Simple::BlackBox::
0 | 0 | 0 | 0s | 0s | _ponder_back | Pod::Simple::BlackBox::
0 | 0 | 0 | 0s | 0s | _ponder_begin | Pod::Simple::BlackBox::
0 | 0 | 0 | 0s | 0s | _ponder_doc_end | Pod::Simple::BlackBox::
0 | 0 | 0 | 0s | 0s | _ponder_end | Pod::Simple::BlackBox::
0 | 0 | 0 | 0s | 0s | _ponder_for | Pod::Simple::BlackBox::
0 | 0 | 0 | 0s | 0s | _ponder_item | Pod::Simple::BlackBox::
0 | 0 | 0 | 0s | 0s | _ponder_over | Pod::Simple::BlackBox::
0 | 0 | 0 | 0s | 0s | _ponder_paragraph_buffer | Pod::Simple::BlackBox::
0 | 0 | 0 | 0s | 0s | _ponder_pod | Pod::Simple::BlackBox::
0 | 0 | 0 | 0s | 0s | _stringify_lol | Pod::Simple::BlackBox::
0 | 0 | 0 | 0s | 0s | _traverse_treelet_bit | Pod::Simple::BlackBox::
0 | 0 | 0 | 0s | 0s | _treelet_from_formatting_codes | Pod::Simple::BlackBox::
0 | 0 | 0 | 0s | 0s | _verbatim_format | Pod::Simple::BlackBox::
0 | 0 | 0 | 0s | 0s | parse_line | Pod::Simple::BlackBox::
0 | 0 | 0 | 0s | 0s | parse_lines | Pod::Simple::BlackBox::
0 | 0 | 0 | 0s | 0s | pretty | Pod::Simple::BlackBox::
0 | 0 | 0 | 0s | 0s | reinit | Pod::Simple::BlackBox::
0 | 0 | 0 | 0s | 0s | stringify_lol | Pod::Simple::BlackBox::
0 | 0 | 0 | 0s | 0s | text_content_of_treelet | Pod::Simple::BlackBox::
Line | State ments |
Time on line |
Calls | Time in subs |
Code |
---|---|---|---|---|---|
1 | package Pod::Simple::BlackBox; | ||||
2 | # | ||||
3 | # "What's in the box?" "Pain." | ||||
4 | # | ||||
5 | ########################################################################### | ||||
6 | # | ||||
7 | # This is where all the scary things happen: parsing lines into | ||||
8 | # paragraphs; and then into directives, verbatims, and then also | ||||
9 | # turning formatting sequences into treelets. | ||||
10 | # | ||||
11 | # Are you really sure you want to read this code? | ||||
12 | # | ||||
13 | #----------------------------------------------------------------------------- | ||||
14 | # | ||||
15 | # The basic work of this module Pod::Simple::BlackBox is doing the dirty work | ||||
16 | # of parsing Pod into treelets (generally one per non-verbatim paragraph), and | ||||
17 | # to call the proper callbacks on the treelets. | ||||
18 | # | ||||
19 | # Every node in a treelet is a ['name', {attrhash}, ...children...] | ||||
20 | |||||
21 | 2 | 13µs | 2 | 9µs | # spent 8µs (7+900ns) within Pod::Simple::BlackBox::BEGIN@21 which was called:
# once (7µs+900ns) by Pod::Simple::LinkSection::BEGIN@7 at line 21 # spent 8µs making 1 call to Pod::Simple::BlackBox::BEGIN@21
# spent 900ns making 1 call to integer::import |
22 | 2 | 12µs | 2 | 4µs | # spent 4µs (3+600ns) within Pod::Simple::BlackBox::BEGIN@22 which was called:
# once (3µs+600ns) by Pod::Simple::LinkSection::BEGIN@7 at line 22 # spent 4µs making 1 call to Pod::Simple::BlackBox::BEGIN@22
# spent 600ns making 1 call to strict::import |
23 | 2 | 12µs | 1 | 2µs | # spent 2µs within Pod::Simple::BlackBox::BEGIN@23 which was called:
# once (2µs+0s) by Pod::Simple::LinkSection::BEGIN@7 at line 23 # spent 2µs making 1 call to Pod::Simple::BlackBox::BEGIN@23 |
24 | 2 | 89µs | 2 | 34µs | # spent 19µs (3+16) within Pod::Simple::BlackBox::BEGIN@24 which was called:
# once (3µs+16µs) by Pod::Simple::LinkSection::BEGIN@7 at line 24 # spent 19µs making 1 call to Pod::Simple::BlackBox::BEGIN@24
# spent 16µs making 1 call to vars::import |
25 | 1 | 300ns | $VERSION = '3.43'; | ||
26 | #use constant DEBUG => 7; | ||||
27 | |||||
28 | # spent 883µs (622+261) within Pod::Simple::BlackBox::my_qr which was called 9 times, avg 98µs/call:
# once (96µs+35µs) by Pod::Text::BEGIN@26 at line 351 of Pod/Simple.pm
# once (87µs+38µs) by Pod::Simple::LinkSection::BEGIN@7 at line 63
# once (77µs+25µs) by Pod::Simple::LinkSection::BEGIN@7 at line 88
# once (58µs+37µs) by Pod::Simple::LinkSection::BEGIN@7 at line 76
# once (69µs+24µs) by Pod::Simple::LinkSection::BEGIN@7 at line 67
# once (60µs+28µs) by Pod::Simple::LinkSection::BEGIN@7 at line 68
# once (63µs+24µs) by Pod::Simple::LinkSection::BEGIN@7 at line 85
# once (59µs+27µs) by Pod::Simple::LinkSection::BEGIN@7 at line 70
# once (54µs+22µs) by Pod::Simple::LinkSection::BEGIN@7 at line 92 | ||||
29 | |||||
30 | # $1 is a pattern to compile and return. Older perls compile any | ||||
31 | # syntactically valid property, even if it isn't legal. To cope with | ||||
32 | # this, return an empty string unless the compiled pattern also | ||||
33 | # successfully matches $2, which the caller furnishes. | ||||
34 | |||||
35 | 9 | 3µs | my ($input_re, $should_match) = @_; | ||
36 | # XXX could have a third parameter $shouldnt_match for extra safety | ||||
37 | |||||
38 | 9 | 16µs | my $use_utf8 = ($] le 5.006002) ? 'use utf8;' : ""; | ||
39 | |||||
40 | 9 | 154µs | my $re = eval "no warnings; $use_utf8 qr/$input_re/"; # spent 34µs executing statements in string eval # includes 8µs spent executing 1 call to 1 sub defined therein. # spent 29µs executing statements in string eval # includes 4µs spent executing 1 call to 1 sub defined therein. # spent 25µs executing statements in string eval # includes 8µs spent executing 1 call to 1 sub defined therein. # spent 20µs executing statements in string eval # includes 4µs spent executing 1 call to 1 sub defined therein. # spent 18µs executing statements in string eval # includes 4µs spent executing 1 call to 1 sub defined therein. # spent 17µs executing statements in string eval # includes 3µs spent executing 1 call to 1 sub defined therein. # spent 16µs executing statements in string eval # includes 4µs spent executing 1 call to 1 sub defined therein. # spent 15µs executing statements in string eval # includes 4µs spent executing 1 call to 1 sub defined therein. # spent 15µs executing statements in string eval # includes 3µs spent executing 1 call to 1 sub defined therein. | ||
41 | #print STDERR __LINE__, ": $input_re: $@\n" if $@; | ||||
42 | 9 | 1µs | return "" if $@; | ||
43 | |||||
44 | 9 | 140µs | my $matches = eval "no warnings; $use_utf8 '$should_match' =~ /$re/"; # spent 38µs executing statements in string eval # includes 3µs spent executing 1 call to 1 sub defined therein. # spent 31µs executing statements in string eval # includes 4µs spent executing 1 call to 1 sub defined therein. # spent 31µs executing statements in string eval # includes 4µs spent executing 1 call to 1 sub defined therein. # spent 28µs executing statements in string eval # includes 4µs spent executing 1 call to 1 sub defined therein. # spent 27µs executing statements in string eval # includes 4µs spent executing 1 call to 1 sub defined therein. # spent 25µs executing statements in string eval # includes 5µs spent executing 1 call to 1 sub defined therein. # spent 24µs executing statements in string eval # includes 4µs spent executing 1 call to 1 sub defined therein. # spent 21µs executing statements in string eval # includes 4µs spent executing 1 call to 1 sub defined therein. # spent 21µs executing statements in string eval # includes 6µs spent executing 1 call to 1 sub defined therein. | ||
45 | #print STDERR __LINE__, ": $input_re: $@\n" if $@; | ||||
46 | 9 | 1µs | return "" if $@; | ||
47 | |||||
48 | #print STDERR __LINE__, ": SUCCESS: $re\n" if $matches; | ||||
49 | 9 | 16µs | return $re if $matches; | ||
50 | |||||
51 | #print STDERR __LINE__, ": $re: didn't match\n"; | ||||
52 | return ""; | ||||
53 | } | ||||
54 | |||||
55 | # spent 3µs within Pod::Simple::BlackBox::BEGIN@55 which was called:
# once (3µs+0s) by Pod::Simple::LinkSection::BEGIN@7 at line 58 | ||||
56 | 1 | 500ns | require Pod::Simple; | ||
57 | 1 | 3µs | *DEBUG = \&Pod::Simple::DEBUG unless defined &DEBUG | ||
58 | 1 | 38µs | 1 | 3µs | } # spent 3µs making 1 call to Pod::Simple::BlackBox::BEGIN@55 |
59 | |||||
60 | # Matches a character iff the character will have a different meaning | ||||
61 | # if we choose CP1252 vs UTF-8 if there is no =encoding line. | ||||
62 | # This is broken for early Perls on non-ASCII platforms. | ||||
63 | 1 | 2µs | 1 | 125µs | my $non_ascii_re = my_qr('[[:^ascii:]]', "\xB6"); # spent 125µs making 1 call to Pod::Simple::BlackBox::my_qr |
64 | 1 | 200ns | $non_ascii_re = qr/[\x80-\xFF]/ unless $non_ascii_re; | ||
65 | |||||
66 | # Use patterns understandable by Perl 5.6, if possible | ||||
67 | 4 | 481µs | 3 | 121µs | # spent 16µs (4+12) within Pod::Simple::BlackBox::BEGIN@67 which was called:
# once (4µs+12µs) by Pod::Simple::LinkSection::BEGIN@7 at line 67 # spent 93µs making 1 call to Pod::Simple::BlackBox::my_qr
# spent 16µs making 1 call to Pod::Simple::BlackBox::BEGIN@67
# spent 12µs making 1 call to warnings::unimport |
68 | 1 | 800ns | 1 | 87µs | my $cn_re = my_qr('\p{IsCn}', "\x{09E4}"); # <reserved> code point unlikely # spent 87µs making 1 call to Pod::Simple::BlackBox::my_qr |
69 | # to get assigned | ||||
70 | 1 | 800ns | 1 | 86µs | my $rare_blocks_re = my_qr('[\p{InIPAExtensions}\p{InSpacingModifierLetters}]', # spent 86µs making 1 call to Pod::Simple::BlackBox::my_qr |
71 | "\x{250}"); | ||||
72 | 1 | 100ns | $rare_blocks_re = my_qr('[\x{0250}-\x{02FF}]', "\x{250}") unless $rare_blocks_re; | ||
73 | |||||
74 | 1 | 16µs | my $script_run_re = eval 'no warnings "experimental::script_run"; # spent 18µs executing statements in string eval # includes 4µs spent executing 1 call to 1 sub defined therein. | ||
75 | qr/(*script_run: ^ .* $ )/x'; | ||||
76 | 1 | 900ns | 1 | 95µs | my $latin_re = my_qr('[\p{IsLatin}\p{IsInherited}\p{IsCommon}]', "\x{100}"); # spent 95µs making 1 call to Pod::Simple::BlackBox::my_qr |
77 | 1 | 200ns | unless ($latin_re) { | ||
78 | # This was machine generated to be the ranges of the union of the above | ||||
79 | # three properties, with things that were undefined by Unicode 4.1 filling | ||||
80 | # gaps. That is the version in use when Perl advanced enough to | ||||
81 | # successfully compile and execute the above pattern. | ||||
82 | $latin_re = my_qr('[\x00-\x{02E9}\x{02EC}-\x{0374}\x{037E}\x{0385}\x{0387}\x{0485}\x{0486}\x{0589}\x{060C}\x{061B}\x{061F}\x{0640}\x{064B}-\x{0655}\x{0670}\x{06DD}\x{0951}-\x{0954}\x{0964}\x{0965}\x{0E3F}\x{10FB}\x{16EB}-\x{16ED}\x{1735}\x{1736}\x{1802}\x{1803}\x{1805}\x{1D00}-\x{1D25}\x{1D2C}-\x{1D5C}\x{1D62}-\x{1D65}\x{1D6B}-\x{1D77}\x{1D79}-\x{1DBE}\x{1DC0}-\x{1EF9}\x{2000}-\x{2125}\x{2127}-\x{27FF}\x{2900}-\x{2B13}\x{2E00}-\x{2E1D}\x{2FF0}-\x{3004}\x{3006}\x{3008}-\x{3020}\x{302A}-\x{302D}\x{3030}-\x{3037}\x{303C}-\x{303F}\x{3099}-\x{309C}\x{30A0}\x{30FB}\x{30FC}\x{3190}-\x{319F}\x{31C0}-\x{31CF}\x{3220}-\x{325F}\x{327F}-\x{32CF}\x{3358}-\x{33FF}\x{4DC0}-\x{4DFF}\x{A700}-\x{A716}\x{FB00}-\x{FB06}\x{FD3E}\x{FD3F}\x{FE00}-\x{FE6B}\x{FEFF}-\x{FF65}\x{FF70}\x{FF9E}\x{FF9F}\x{FFE0}-\x{FFFD}\x{10100}-\x{1013F}\x{1D000}-\x{1D1DD}\x{1D300}-\x{1D7FF}]', "\x{100}"); | ||||
83 | } | ||||
84 | |||||
85 | 1 | 1µs | 1 | 87µs | my $every_char_is_latin_re = my_qr("^(?:$latin_re)*\\z", "A"); # spent 87µs making 1 call to Pod::Simple::BlackBox::my_qr |
86 | |||||
87 | # Latin script code points not in the first release of Unicode | ||||
88 | 1 | 800ns | 1 | 103µs | my $later_latin_re = my_qr('[^\P{IsLatin}\p{IsAge=1.1}]', "\x{1F6}"); # spent 103µs making 1 call to Pod::Simple::BlackBox::my_qr |
89 | |||||
90 | # If this perl doesn't have the Deprecated property, there's only one code | ||||
91 | # point in it that we need be concerned with. | ||||
92 | 1 | 700ns | 1 | 76µs | my $deprecated_re = my_qr('\p{IsDeprecated}', "\x{149}"); # spent 76µs making 1 call to Pod::Simple::BlackBox::my_qr |
93 | 1 | 200ns | $deprecated_re = qr/\x{149}/ unless $deprecated_re; | ||
94 | |||||
95 | 1 | 200ns | my $utf8_bom; | ||
96 | 1 | 2µs | if (($] ge 5.007_003)) { | ||
97 | 1 | 300ns | $utf8_bom = "\x{FEFF}"; | ||
98 | 1 | 3µs | 1 | 700ns | utf8::encode($utf8_bom); # spent 700ns making 1 call to utf8::encode |
99 | } else { | ||||
100 | $utf8_bom = "\xEF\xBB\xBF"; # No EBCDIC BOM detection for early Perls. | ||||
101 | } | ||||
102 | |||||
103 | # This is used so that the 'content_seen' method doesn't return true on a | ||||
104 | # file that just happens to have a line that matches /^=[a-zA-z]/. Only if | ||||
105 | # there is a valid =foo line will we return that content was seen. | ||||
106 | 1 | 100ns | my $seen_legal_directive = 0; | ||
107 | |||||
108 | #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | ||||
109 | |||||
110 | sub parse_line { shift->parse_lines(@_) } # alias | ||||
111 | |||||
112 | # - - - Turn back now! Run away! - - - | ||||
113 | |||||
114 | sub parse_lines { # Usage: $parser->parse_lines(@lines) | ||||
115 | # an undef means end-of-stream | ||||
116 | my $self = shift; | ||||
117 | |||||
118 | my $code_handler = $self->{'code_handler'}; | ||||
119 | my $cut_handler = $self->{'cut_handler'}; | ||||
120 | my $wl_handler = $self->{'whiteline_handler'}; | ||||
121 | $self->{'line_count'} ||= 0; | ||||
122 | |||||
123 | my $scratch; | ||||
124 | |||||
125 | DEBUG > 4 and | ||||
126 | print STDERR "# Parsing starting at line ", $self->{'line_count'}, ".\n"; | ||||
127 | |||||
128 | DEBUG > 5 and | ||||
129 | print STDERR "# About to parse lines: ", | ||||
130 | join(' ', map defined($_) ? "[$_]" : "EOF", @_), "\n"; | ||||
131 | |||||
132 | my $paras = ($self->{'paras'} ||= []); | ||||
133 | # paragraph buffer. Because we need to defer processing of =over | ||||
134 | # directives and verbatim paragraphs. We call _ponder_paragraph_buffer | ||||
135 | # to process this. | ||||
136 | |||||
137 | $self->{'pod_para_count'} ||= 0; | ||||
138 | |||||
139 | # An attempt to match the pod portions of a line. This is not fool proof, | ||||
140 | # but is good enough to serve as part of the heuristic for guessing the pod | ||||
141 | # encoding if not specified. | ||||
142 | my $codes = join '', grep { / ^ [A-Za-z] $/x } sort keys %{$self->{accept_codes}}; | ||||
143 | my $pod_chars_re = qr/ ^ = [A-Za-z]+ | [\Q$codes\E] < /x; | ||||
144 | |||||
145 | my $line; | ||||
146 | foreach my $source_line (@_) { | ||||
147 | if( $self->{'source_dead'} ) { | ||||
148 | DEBUG > 4 and print STDERR "# Source is dead.\n"; | ||||
149 | last; | ||||
150 | } | ||||
151 | |||||
152 | unless( defined $source_line ) { | ||||
153 | DEBUG > 4 and print STDERR "# Undef-line seen.\n"; | ||||
154 | |||||
155 | push @$paras, ['~end', {'start_line' => $self->{'line_count'}}]; | ||||
156 | push @$paras, $paras->[-1], $paras->[-1]; | ||||
157 | # So that it definitely fills the buffer. | ||||
158 | $self->{'source_dead'} = 1; | ||||
159 | $self->_ponder_paragraph_buffer; | ||||
160 | next; | ||||
161 | } | ||||
162 | |||||
163 | |||||
164 | if( $self->{'line_count'}++ ) { | ||||
165 | ($line = $source_line) =~ tr/\n\r//d; | ||||
166 | # If we don't have two vars, we'll end up with that there | ||||
167 | # tr/// modding the (potentially read-only) original source line! | ||||
168 | |||||
169 | } else { | ||||
170 | DEBUG > 2 and print STDERR "First line: [$source_line]\n"; | ||||
171 | |||||
172 | if( ($line = $source_line) =~ s/^$utf8_bom//s ) { | ||||
173 | DEBUG and print STDERR "UTF-8 BOM seen. Faking a '=encoding utf8'.\n"; | ||||
174 | $self->_handle_encoding_line( "=encoding utf8" ); | ||||
175 | delete $self->{'_processed_encoding'}; | ||||
176 | $line =~ tr/\n\r//d; | ||||
177 | |||||
178 | } elsif( $line =~ s/^\xFE\xFF//s ) { | ||||
179 | DEBUG and print STDERR "Big-endian UTF-16 BOM seen. Aborting parsing.\n"; | ||||
180 | $self->scream( | ||||
181 | $self->{'line_count'}, | ||||
182 | "UTF16-BE Byte Encoding Mark found; but Pod::Simple v$Pod::Simple::VERSION doesn't implement UTF16 yet." | ||||
183 | ); | ||||
184 | splice @_; | ||||
185 | push @_, undef; | ||||
186 | next; | ||||
187 | |||||
188 | # TODO: implement somehow? | ||||
189 | |||||
190 | } elsif( $line =~ s/^\xFF\xFE//s ) { | ||||
191 | DEBUG and print STDERR "Little-endian UTF-16 BOM seen. Aborting parsing.\n"; | ||||
192 | $self->scream( | ||||
193 | $self->{'line_count'}, | ||||
194 | "UTF16-LE Byte Encoding Mark found; but Pod::Simple v$Pod::Simple::VERSION doesn't implement UTF16 yet." | ||||
195 | ); | ||||
196 | splice @_; | ||||
197 | push @_, undef; | ||||
198 | next; | ||||
199 | |||||
200 | # TODO: implement somehow? | ||||
201 | |||||
202 | } else { | ||||
203 | DEBUG > 2 and print STDERR "First line is BOM-less.\n"; | ||||
204 | ($line = $source_line) =~ tr/\n\r//d; | ||||
205 | } | ||||
206 | } | ||||
207 | |||||
208 | if(!$self->{'parse_characters'} && !$self->{'encoding'} | ||||
209 | && ($self->{'in_pod'} || $line =~ /^=/s) | ||||
210 | && $line =~ /$non_ascii_re/ | ||||
211 | ) { | ||||
212 | |||||
213 | my $encoding; | ||||
214 | |||||
215 | # No =encoding line, and we are at the first pod line in the input that | ||||
216 | # contains a non-ascii byte, that is, one whose meaning varies depending | ||||
217 | # on whether the file is encoded in UTF-8 or CP1252, which are the two | ||||
218 | # possibilities permitted by the pod spec. (ASCII is assumed if the | ||||
219 | # file only contains ASCII bytes.) In order to process this line, we | ||||
220 | # need to figure out what encoding we will use for the file. | ||||
221 | # | ||||
222 | # Strictly speaking ISO 8859-1 (Latin 1) refers to the code points | ||||
223 | # 160-255, but it is used here, as it often colloquially is, to refer to | ||||
224 | # the complete set of code points 0-255, including ASCII (0-127), the C1 | ||||
225 | # controls (128-159), and strict Latin 1 (160-255). | ||||
226 | # | ||||
227 | # CP1252 is effectively a superset of Latin 1, because it differs only | ||||
228 | # from colloquial 8859-1 in the C1 controls, which are very unlikely to | ||||
229 | # actually be present in 8859-1 files, so can be used for other purposes | ||||
230 | # without conflict. CP 1252 uses most of them for graphic characters. | ||||
231 | # | ||||
232 | # Note that all ASCII-range bytes represent their corresponding code | ||||
233 | # points in both CP1252 and UTF-8. In ASCII platform UTF-8, all other | ||||
234 | # code points require multiple (non-ASCII) bytes to represent. (A | ||||
235 | # separate paragraph for EBCDIC is below.) The multi-byte | ||||
236 | # representation is quite structured. If we find an isolated byte that | ||||
237 | # would require multiple bytes to represent in UTF-8, we know that the | ||||
238 | # encoding is not UTF-8. If we find a sequence of bytes that violates | ||||
239 | # the UTF-8 structure, we also can presume the encoding isn't UTF-8, and | ||||
240 | # hence must be 1252. | ||||
241 | # | ||||
242 | # But there are ambiguous cases where we could guess wrong. If so, the | ||||
243 | # user will end up having to supply an =encoding line. We use all | ||||
244 | # readily available information to improve our chances of guessing | ||||
245 | # right. The odds of something not being UTF-8, but still passing a | ||||
246 | # UTF-8 validity test go down very rapidly with increasing length of the | ||||
247 | # sequence. Therefore we look at all non-ascii sequences on the line. | ||||
248 | # If any of the sequences can't be UTF-8, we quit there and choose | ||||
249 | # CP1252. If all could be UTF-8, we see if any of the code points | ||||
250 | # represented are unlikely to be in pod. If so, we guess CP1252. If | ||||
251 | # not, we check if the line is all in the same script; if not guess | ||||
252 | # CP1252; otherwise UTF-8. For perls that don't have convenient script | ||||
253 | # run testing, see if there is both Latin and non-Latin. If so, CP1252, | ||||
254 | # otherwise UTF-8. | ||||
255 | # | ||||
256 | # On EBCDIC platforms, the situation is somewhat different. In | ||||
257 | # UTF-EBCDIC, not only do ASCII-range bytes represent their code points, | ||||
258 | # but so do the bytes that are for the C1 controls. Recall that these | ||||
259 | # correspond to the unused portion of 8859-1 that 1252 mostly takes | ||||
260 | # over. That means that there are fewer code points that are | ||||
261 | # represented by multi-bytes. But, note that the these controls are | ||||
262 | # very unlikely to be in pod text. So if we encounter one of them, it | ||||
263 | # means that it is quite likely CP1252 and not UTF-8. The net result is | ||||
264 | # the same code below is used for both platforms. | ||||
265 | # | ||||
266 | # XXX probably if the line has E<foo> that evaluates to illegal CP1252, | ||||
267 | # then it is UTF-8. But we haven't processed E<> yet. | ||||
268 | |||||
269 | goto set_1252 if $] lt 5.006_000; # No UTF-8 on very early perls | ||||
270 | |||||
271 | my $copy; | ||||
272 | |||||
273 | 2 | 43µs | 2 | 24µs | # spent 14µs (5+9) within Pod::Simple::BlackBox::BEGIN@273 which was called:
# once (5µs+9µs) by Pod::Simple::LinkSection::BEGIN@7 at line 273 # spent 14µs making 1 call to Pod::Simple::BlackBox::BEGIN@273
# spent 9µs making 1 call to warnings::unimport |
274 | |||||
275 | if ($] ge 5.007_003) { | ||||
276 | $copy = $line; | ||||
277 | |||||
278 | # On perls that have this function, we can use it to easily see if the | ||||
279 | # sequence is valid UTF-8 or not; if valid it turns on the UTF-8 flag | ||||
280 | # needed below for script run detection | ||||
281 | goto set_1252 if ! utf8::decode($copy); | ||||
282 | } | ||||
283 | elsif (ord("A") != 65) { # Early EBCDIC, assume UTF-8. What's a windows | ||||
284 | # code page doing here anyway? | ||||
285 | goto set_utf8; | ||||
286 | } | ||||
287 | else { # ASCII, no decode(): do it ourselves using the fundamental | ||||
288 | # characteristics of UTF-8 | ||||
289 | 2 | 3.77ms | 2 | 17µs | # spent 15µs (11+4) within Pod::Simple::BlackBox::BEGIN@289 which was called:
# once (11µs+4µs) by Pod::Simple::LinkSection::BEGIN@7 at line 289 # spent 15µs making 1 call to Pod::Simple::BlackBox::BEGIN@289
# spent 2µs making 1 call to if::import |
290 | |||||
291 | my $char_ord; | ||||
292 | my $needed; # How many continuation bytes to gobble up | ||||
293 | |||||
294 | # Initialize the translated line with a dummy character that will be | ||||
295 | # deleted after everything else is done. This dummy makes sure that | ||||
296 | # $copy will be in UTF-8. Doing it now avoids the bugs in early perls | ||||
297 | # with upgrading in the middle | ||||
298 | $copy = chr(0x100); | ||||
299 | |||||
300 | # Parse through the line | ||||
301 | for (my $i = 0; $i < length $line; $i++) { | ||||
302 | my $byte = substr($line, $i, 1); | ||||
303 | |||||
304 | # ASCII bytes are trivially dealt with | ||||
305 | if ($byte !~ $non_ascii_re) { | ||||
306 | $copy .= $byte; | ||||
307 | next; | ||||
308 | } | ||||
309 | |||||
310 | my $b_ord = ord $byte; | ||||
311 | |||||
312 | # Now figure out what this code point would be if the input is | ||||
313 | # actually in UTF-8. If, in the process, we discover that it isn't | ||||
314 | # well-formed UTF-8, we guess CP1252. | ||||
315 | # | ||||
316 | # Start the process. If it is UTF-8, we are at the first, start | ||||
317 | # byte, of a multi-byte sequence. We look at this byte to figure | ||||
318 | # out how many continuation bytes are needed, and to initialize the | ||||
319 | # code point accumulator with the data from this byte. | ||||
320 | # | ||||
321 | # Normally the minimum continuation byte is 0x80, but in certain | ||||
322 | # instances the minimum is a higher number. So the code below | ||||
323 | # overrides this for those instances. | ||||
324 | my $min_cont = 0x80; | ||||
325 | |||||
326 | if ($b_ord < 0xC2) { # A start byte < C2 is malformed | ||||
327 | goto set_1252; | ||||
328 | } | ||||
329 | elsif ($b_ord <= 0xDF) { | ||||
330 | $needed = 1; | ||||
331 | $char_ord = $b_ord & 0x1F; | ||||
332 | } | ||||
333 | elsif ($b_ord <= 0xEF) { | ||||
334 | $min_cont = 0xA0 if $b_ord == 0xE0; | ||||
335 | $needed = 2; | ||||
336 | $char_ord = $b_ord & (0x1F >> 1); | ||||
337 | } | ||||
338 | elsif ($b_ord <= 0xF4) { | ||||
339 | $min_cont = 0x90 if $b_ord == 0xF0; | ||||
340 | $needed = 3; | ||||
341 | $char_ord = $b_ord & (0x1F >> 2); | ||||
342 | } | ||||
343 | else { # F4 is the highest start byte for legal Unicode; higher is | ||||
344 | # unlikely to be in pod. | ||||
345 | goto set_1252; | ||||
346 | } | ||||
347 | |||||
348 | # ? not enough continuation bytes available | ||||
349 | goto set_1252 if $i + $needed >= length $line; | ||||
350 | |||||
351 | # Accumulate the ordinal of the character from the remaining | ||||
352 | # (continuation) bytes. | ||||
353 | while ($needed-- > 0) { | ||||
354 | my $cont = substr($line, ++$i, 1); | ||||
355 | $b_ord = ord $cont; | ||||
356 | goto set_1252 if $b_ord < $min_cont || $b_ord > 0xBF; | ||||
357 | |||||
358 | # In all cases, any next continuation bytes all have the same | ||||
359 | # minimum legal value | ||||
360 | $min_cont = 0x80; | ||||
361 | |||||
362 | # Accumulate this byte's contribution to the code point | ||||
363 | $char_ord <<= 6; | ||||
364 | $char_ord |= ($b_ord & 0x3F); | ||||
365 | } | ||||
366 | |||||
367 | # Here, the sequence that formed this code point was valid UTF-8, | ||||
368 | # so add the completed character to the output | ||||
369 | $copy .= chr $char_ord; | ||||
370 | } # End of loop through line | ||||
371 | |||||
372 | # Delete the dummy first character | ||||
373 | $copy = substr($copy, 1); | ||||
374 | } | ||||
375 | |||||
376 | # Here, $copy is legal UTF-8. | ||||
377 | |||||
378 | # If it can't be legal CP1252, no need to look further. (These bytes | ||||
379 | # aren't valid in CP1252.) This test could have been placed higher in | ||||
380 | # the code, but it seemed wrong to set the encoding to UTF-8 without | ||||
381 | # making sure that the very first instance is well-formed. But what if | ||||
382 | # it isn't legal CP1252 either? We have to choose one or the other, and | ||||
383 | # It seems safer to favor the single-byte encoding over the multi-byte. | ||||
384 | goto set_utf8 if ord("A") == 65 && $line =~ /[\x81\x8D\x8F\x90\x9D]/; | ||||
385 | |||||
386 | # The C1 controls are not likely to appear in pod | ||||
387 | goto set_1252 if ord("A") == 65 && $copy =~ /[\x80-\x9F]/; | ||||
388 | |||||
389 | # Nor are surrogates nor unassigned, nor deprecated. | ||||
390 | DEBUG > 8 and print STDERR __LINE__, ": $copy: surrogate\n" if $copy =~ $cs_re; | ||||
391 | goto set_1252 if $cs_re && $copy =~ $cs_re; | ||||
392 | DEBUG > 8 and print STDERR __LINE__, ": $copy: unassigned\n" if $cn_re && $copy =~ $cn_re; | ||||
393 | goto set_1252 if $cn_re && $copy =~ $cn_re; | ||||
394 | DEBUG > 8 and print STDERR __LINE__, ": $copy: deprecated\n" if $copy =~ $deprecated_re; | ||||
395 | goto set_1252 if $copy =~ $deprecated_re; | ||||
396 | |||||
397 | # Nor are rare code points. But this is hard to determine. khw | ||||
398 | # believes that IPA characters and the modifier letters are unlikely to | ||||
399 | # be in pod (and certainly very unlikely to be the in the first line in | ||||
400 | # the pod containing non-ASCII) | ||||
401 | DEBUG > 8 and print STDERR __LINE__, ": $copy: rare\n" if $copy =~ $rare_blocks_re; | ||||
402 | goto set_1252 if $rare_blocks_re && $copy =~ $rare_blocks_re; | ||||
403 | |||||
404 | # The first Unicode version included essentially every Latin character | ||||
405 | # in modern usage. So, a Latin character not in the first release will | ||||
406 | # unlikely be in pod. | ||||
407 | DEBUG > 8 and print STDERR __LINE__, ": $copy: later_latin\n" if $later_latin_re && $copy =~ $later_latin_re; | ||||
408 | goto set_1252 if $later_latin_re && $copy =~ $later_latin_re; | ||||
409 | |||||
410 | # On perls that handle script runs, if the UTF-8 interpretation yields | ||||
411 | # a single script, we guess UTF-8, otherwise just having a mixture of | ||||
412 | # scripts is suspicious, so guess CP1252. We first strip off, as best | ||||
413 | # we can, the ASCII characters that look like they are pod directives, | ||||
414 | # as these would always show as mixed with non-Latin text. | ||||
415 | $copy =~ s/$pod_chars_re//g; | ||||
416 | |||||
417 | if ($script_run_re) { | ||||
418 | goto set_utf8 if $copy =~ $script_run_re; | ||||
419 | DEBUG > 8 and print STDERR __LINE__, ": not script run\n"; | ||||
420 | goto set_1252; | ||||
421 | } | ||||
422 | |||||
423 | # Even without script runs, but on recent enough perls and Unicodes, we | ||||
424 | # can check if there is a mixture of both Latin and non-Latin. Again, | ||||
425 | # having a mixture of scripts is suspicious, so assume CP1252 | ||||
426 | |||||
427 | # If it's all non-Latin, there is no CP1252, as that is Latin | ||||
428 | # characters and punct, etc. | ||||
429 | DEBUG > 8 and print STDERR __LINE__, ": $copy: not latin\n" if $copy !~ $latin_re; | ||||
430 | goto set_utf8 if $copy !~ $latin_re; | ||||
431 | |||||
432 | DEBUG > 8 and print STDERR __LINE__, ": $copy: all latin\n" if $copy =~ $every_char_is_latin_re; | ||||
433 | goto set_utf8 if $copy =~ $every_char_is_latin_re; | ||||
434 | |||||
435 | DEBUG > 8 and print STDERR __LINE__, ": $copy: mixed\n"; | ||||
436 | |||||
437 | set_1252: | ||||
438 | DEBUG > 9 and print STDERR __LINE__, ": $copy: is 1252\n"; | ||||
439 | $encoding = 'CP1252'; | ||||
440 | goto done_set; | ||||
441 | |||||
442 | set_utf8: | ||||
443 | DEBUG > 9 and print STDERR __LINE__, ": $copy: is UTF-8\n"; | ||||
444 | $encoding = 'UTF-8'; | ||||
445 | |||||
446 | done_set: | ||||
447 | $self->_handle_encoding_line( "=encoding $encoding" ); | ||||
448 | delete $self->{'_processed_encoding'}; | ||||
449 | $self->{'_transcoder'} && $self->{'_transcoder'}->($line); | ||||
450 | |||||
451 | my ($word) = $line =~ /(\S*$non_ascii_re\S*)/; | ||||
452 | |||||
453 | $self->whine( | ||||
454 | $self->{'line_count'}, | ||||
455 | "Non-ASCII character seen before =encoding in '$word'. Assuming $encoding" | ||||
456 | ); | ||||
457 | } | ||||
458 | |||||
459 | DEBUG > 5 and print STDERR "# Parsing line: [$line]\n"; | ||||
460 | |||||
461 | if(!$self->{'in_pod'}) { | ||||
462 | if($line =~ m/^=([a-zA-Z][a-zA-Z0-9]*)(?:\s|$)/s) { | ||||
463 | if($1 eq 'cut') { | ||||
464 | $self->scream( | ||||
465 | $self->{'line_count'}, | ||||
466 | "=cut found outside a pod block. Skipping to next block." | ||||
467 | ); | ||||
468 | |||||
469 | ## Before there were errata sections in the world, it was | ||||
470 | ## least-pessimal to abort processing the file. But now we can | ||||
471 | ## just barrel on thru (but still not start a pod block). | ||||
472 | #splice @_; | ||||
473 | #push @_, undef; | ||||
474 | |||||
475 | next; | ||||
476 | } else { | ||||
477 | $self->{'in_pod'} = $self->{'start_of_pod_block'} | ||||
478 | = $self->{'last_was_blank'} = 1; | ||||
479 | # And fall thru to the pod-mode block further down | ||||
480 | } | ||||
481 | } else { | ||||
482 | DEBUG > 5 and print STDERR "# It's a code-line.\n"; | ||||
483 | $code_handler->(map $_, $line, $self->{'line_count'}, $self) | ||||
484 | if $code_handler; | ||||
485 | # Note: this may cause code to be processed out of order relative | ||||
486 | # to pods, but in order relative to cuts. | ||||
487 | |||||
488 | # Note also that we haven't yet applied the transcoding to $line | ||||
489 | # by time we call $code_handler! | ||||
490 | |||||
491 | if( $line =~ m/^#\s*line\s+(\d+)\s*(?:\s"([^"]+)")?\s*$/ ) { | ||||
492 | # That RE is from perlsyn, section "Plain Old Comments (Not!)", | ||||
493 | #$fname = $2 if defined $2; | ||||
494 | #DEBUG > 1 and defined $2 and print STDERR "# Setting fname to \"$fname\"\n"; | ||||
495 | DEBUG > 1 and print STDERR "# Setting nextline to $1\n"; | ||||
496 | $self->{'line_count'} = $1 - 1; | ||||
497 | } | ||||
498 | |||||
499 | next; | ||||
500 | } | ||||
501 | } | ||||
502 | |||||
503 | # . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | ||||
504 | # Else we're in pod mode: | ||||
505 | |||||
506 | # Apply any necessary transcoding: | ||||
507 | $self->{'_transcoder'} && $self->{'_transcoder'}->($line); | ||||
508 | |||||
509 | # HERE WE CATCH =encoding EARLY! | ||||
510 | if( $line =~ m/^=encoding\s+\S+\s*$/s ) { | ||||
511 | next if $self->parse_characters; # Ignore this line | ||||
512 | $line = $self->_handle_encoding_line( $line ); | ||||
513 | } | ||||
514 | |||||
515 | if($line =~ m/^=cut/s) { | ||||
516 | # here ends the pod block, and therefore the previous pod para | ||||
517 | DEBUG > 1 and print STDERR "Noting =cut at line ${$self}{'line_count'}\n"; | ||||
518 | $self->{'in_pod'} = 0; | ||||
519 | # ++$self->{'pod_para_count'}; | ||||
520 | $self->_ponder_paragraph_buffer(); | ||||
521 | # by now it's safe to consider the previous paragraph as done. | ||||
522 | DEBUG > 6 and print STDERR "Processing any cut handler, line ${$self}{'line_count'}\n"; | ||||
523 | $cut_handler->(map $_, $line, $self->{'line_count'}, $self) | ||||
524 | if $cut_handler; | ||||
525 | |||||
526 | # TODO: add to docs: Note: this may cause cuts to be processed out | ||||
527 | # of order relative to pods, but in order relative to code. | ||||
528 | |||||
529 | } elsif($line =~ m/^(\s*)$/s) { # it's a blank line | ||||
530 | if (defined $1 and $1 =~ /[^\S\r\n]/) { # it's a white line | ||||
531 | $wl_handler->(map $_, $line, $self->{'line_count'}, $self) | ||||
532 | if $wl_handler; | ||||
533 | } | ||||
534 | |||||
535 | if(!$self->{'start_of_pod_block'} and @$paras and $paras->[-1][0] eq '~Verbatim') { | ||||
536 | DEBUG > 1 and print STDERR "Saving blank line at line ${$self}{'line_count'}\n"; | ||||
537 | push @{$paras->[-1]}, $line; | ||||
538 | } # otherwise it's not interesting | ||||
539 | |||||
540 | if(!$self->{'start_of_pod_block'} and !$self->{'last_was_blank'}) { | ||||
541 | DEBUG > 1 and print STDERR "Noting para ends with blank line at ${$self}{'line_count'}\n"; | ||||
542 | } | ||||
543 | |||||
544 | $self->{'last_was_blank'} = 1; | ||||
545 | |||||
546 | } elsif($self->{'last_was_blank'}) { # A non-blank line starting a new para... | ||||
547 | |||||
548 | if($line =~ m/^(=[a-zA-Z][a-zA-Z0-9]*)(\s+|$)(.*)/s) { | ||||
549 | # THIS IS THE ONE PLACE WHERE WE CONSTRUCT NEW DIRECTIVE OBJECTS | ||||
550 | my $new = [$1, {'start_line' => $self->{'line_count'}}, $3]; | ||||
551 | $new->[1]{'~orig_spacer'} = $2 if $2 && $2 ne " "; | ||||
552 | # Note that in "=head1 foo", the WS is lost. | ||||
553 | # Example: ['=head1', {'start_line' => 123}, ' foo'] | ||||
554 | |||||
555 | ++$self->{'pod_para_count'}; | ||||
556 | |||||
557 | $self->_ponder_paragraph_buffer(); | ||||
558 | # by now it's safe to consider the previous paragraph as done. | ||||
559 | |||||
560 | push @$paras, $new; # the new incipient paragraph | ||||
561 | DEBUG > 1 and print STDERR "Starting new ${$paras}[-1][0] para at line ${$self}{'line_count'}\n"; | ||||
562 | |||||
563 | } elsif($line =~ m/^\s/s) { | ||||
564 | |||||
565 | if(!$self->{'start_of_pod_block'} and @$paras and $paras->[-1][0] eq '~Verbatim') { | ||||
566 | DEBUG > 1 and print STDERR "Resuming verbatim para at line ${$self}{'line_count'}\n"; | ||||
567 | push @{$paras->[-1]}, $line; | ||||
568 | } else { | ||||
569 | ++$self->{'pod_para_count'}; | ||||
570 | $self->_ponder_paragraph_buffer(); | ||||
571 | # by now it's safe to consider the previous paragraph as done. | ||||
572 | DEBUG > 1 and print STDERR "Starting verbatim para at line ${$self}{'line_count'}\n"; | ||||
573 | push @$paras, ['~Verbatim', {'start_line' => $self->{'line_count'}}, $line]; | ||||
574 | } | ||||
575 | } else { | ||||
576 | ++$self->{'pod_para_count'}; | ||||
577 | $self->_ponder_paragraph_buffer(); | ||||
578 | # by now it's safe to consider the previous paragraph as done. | ||||
579 | push @$paras, ['~Para', {'start_line' => $self->{'line_count'}}, $line]; | ||||
580 | DEBUG > 1 and print STDERR "Starting plain para at line ${$self}{'line_count'}\n"; | ||||
581 | } | ||||
582 | $self->{'last_was_blank'} = $self->{'start_of_pod_block'} = 0; | ||||
583 | |||||
584 | } else { | ||||
585 | # It's a non-blank line /continuing/ the current para | ||||
586 | if(@$paras) { | ||||
587 | DEBUG > 2 and print STDERR "Line ${$self}{'line_count'} continues current paragraph\n"; | ||||
588 | push @{$paras->[-1]}, $line; | ||||
589 | } else { | ||||
590 | # Unexpected case! | ||||
591 | die "Continuing a paragraph but \@\$paras is empty?"; | ||||
592 | } | ||||
593 | $self->{'last_was_blank'} = $self->{'start_of_pod_block'} = 0; | ||||
594 | } | ||||
595 | |||||
596 | } # ends the big while loop | ||||
597 | |||||
598 | DEBUG > 1 and print STDERR (pretty(@$paras), "\n"); | ||||
599 | return $self; | ||||
600 | } | ||||
601 | |||||
602 | #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | ||||
603 | |||||
604 | sub _handle_encoding_line { | ||||
605 | my($self, $line) = @_; | ||||
606 | |||||
607 | return if $self->parse_characters; | ||||
608 | |||||
609 | # The point of this routine is to set $self->{'_transcoder'} as indicated. | ||||
610 | |||||
611 | return $line unless $line =~ m/^=encoding\s+(\S+)\s*$/s; | ||||
612 | DEBUG > 1 and print STDERR "Found an encoding line \"=encoding $1\"\n"; | ||||
613 | |||||
614 | my $e = $1; | ||||
615 | my $orig = $e; | ||||
616 | push @{ $self->{'encoding_command_reqs'} }, "=encoding $orig"; | ||||
617 | |||||
618 | my $enc_error; | ||||
619 | |||||
620 | # Cf. perldoc Encode and perldoc Encode::Supported | ||||
621 | |||||
622 | require Pod::Simple::Transcode; | ||||
623 | |||||
624 | if( $self->{'encoding'} ) { | ||||
625 | my $norm_current = $self->{'encoding'}; | ||||
626 | my $norm_e = $e; | ||||
627 | foreach my $that ($norm_current, $norm_e) { | ||||
628 | $that = lc($that); | ||||
629 | $that =~ s/[-_]//g; | ||||
630 | } | ||||
631 | if($norm_current eq $norm_e) { | ||||
632 | DEBUG > 1 and print STDERR "The '=encoding $orig' line is ", | ||||
633 | "redundant. ($norm_current eq $norm_e). Ignoring.\n"; | ||||
634 | $enc_error = ''; | ||||
635 | # But that doesn't necessarily mean that the earlier one went okay | ||||
636 | } else { | ||||
637 | $enc_error = "Encoding is already set to " . $self->{'encoding'}; | ||||
638 | DEBUG > 1 and print STDERR $enc_error; | ||||
639 | } | ||||
640 | } elsif ( | ||||
641 | # OK, let's turn on the encoding | ||||
642 | do { | ||||
643 | DEBUG > 1 and print STDERR " Setting encoding to $e\n"; | ||||
644 | $self->{'encoding'} = $e; | ||||
645 | 1; | ||||
646 | } | ||||
647 | and $e eq 'HACKRAW' | ||||
648 | ) { | ||||
649 | DEBUG and print STDERR " Putting in HACKRAW (no-op) encoding mode.\n"; | ||||
650 | |||||
651 | } elsif( Pod::Simple::Transcode::->encoding_is_available($e) ) { | ||||
652 | |||||
653 | die($enc_error = "WHAT? _transcoder is already set?!") | ||||
654 | if $self->{'_transcoder'}; # should never happen | ||||
655 | require Pod::Simple::Transcode; | ||||
656 | $self->{'_transcoder'} = Pod::Simple::Transcode::->make_transcoder($e); | ||||
657 | eval { | ||||
658 | my @x = ('', "abc", "123"); | ||||
659 | $self->{'_transcoder'}->(@x); | ||||
660 | }; | ||||
661 | $@ && die( $enc_error = | ||||
662 | "Really unexpected error setting up encoding $e: $@\nAborting" | ||||
663 | ); | ||||
664 | $self->{'detected_encoding'} = $e; | ||||
665 | |||||
666 | } else { | ||||
667 | my @supported = Pod::Simple::Transcode::->all_encodings; | ||||
668 | |||||
669 | # Note unsupported, and complain | ||||
670 | DEBUG and print STDERR " Encoding [$e] is unsupported.", | ||||
671 | "\nSupporteds: @supported\n"; | ||||
672 | my $suggestion = ''; | ||||
673 | |||||
674 | # Look for a near match: | ||||
675 | my $norm = lc($e); | ||||
676 | $norm =~ tr[-_][]d; | ||||
677 | my $n; | ||||
678 | foreach my $enc (@supported) { | ||||
679 | $n = lc($enc); | ||||
680 | $n =~ tr[-_][]d; | ||||
681 | next unless $n eq $norm; | ||||
682 | $suggestion = " (Maybe \"$e\" should be \"$enc\"?)"; | ||||
683 | last; | ||||
684 | } | ||||
685 | my $encmodver = Pod::Simple::Transcode::->encmodver; | ||||
686 | $enc_error = join '' => | ||||
687 | "This document probably does not appear as it should, because its ", | ||||
688 | "\"=encoding $e\" line calls for an unsupported encoding.", | ||||
689 | $suggestion, " [$encmodver\'s supported encodings are: @supported]" | ||||
690 | ; | ||||
691 | |||||
692 | $self->scream( $self->{'line_count'}, $enc_error ); | ||||
693 | } | ||||
694 | push @{ $self->{'encoding_command_statuses'} }, $enc_error; | ||||
695 | if (defined($self->{'_processed_encoding'})) { | ||||
696 | # Double declaration. | ||||
697 | $self->scream( $self->{'line_count'}, 'Cannot have multiple =encoding directives'); | ||||
698 | } | ||||
699 | $self->{'_processed_encoding'} = $orig; | ||||
700 | |||||
701 | return $line; | ||||
702 | } | ||||
703 | |||||
704 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||||
705 | |||||
706 | sub _handle_encoding_second_level { | ||||
707 | # By time this is called, the encoding (if well formed) will already | ||||
708 | # have been acted on. | ||||
709 | my($self, $para) = @_; | ||||
710 | my @x = @$para; | ||||
711 | my $content = join ' ', splice @x, 2; | ||||
712 | $content =~ s/^\s+//s; | ||||
713 | $content =~ s/\s+$//s; | ||||
714 | |||||
715 | DEBUG > 2 and print STDERR "Ogling encoding directive: =encoding $content\n"; | ||||
716 | |||||
717 | if (defined($self->{'_processed_encoding'})) { | ||||
718 | #if($content ne $self->{'_processed_encoding'}) { | ||||
719 | # Could it happen? | ||||
720 | #} | ||||
721 | delete $self->{'_processed_encoding'}; | ||||
722 | # It's already been handled. Check for errors. | ||||
723 | if(! $self->{'encoding_command_statuses'} ) { | ||||
724 | DEBUG > 2 and print STDERR " CRAZY ERROR: It wasn't really handled?!\n"; | ||||
725 | } elsif( $self->{'encoding_command_statuses'}[-1] ) { | ||||
726 | $self->whine( $para->[1]{'start_line'}, | ||||
727 | sprintf "Couldn't do %s: %s", | ||||
728 | $self->{'encoding_command_reqs' }[-1], | ||||
729 | $self->{'encoding_command_statuses'}[-1], | ||||
730 | ); | ||||
731 | } else { | ||||
732 | DEBUG > 2 and print STDERR " (Yup, it was successfully handled already.)\n"; | ||||
733 | } | ||||
734 | |||||
735 | } else { | ||||
736 | # Otherwise it's a syntax error | ||||
737 | $self->whine( $para->[1]{'start_line'}, | ||||
738 | "Invalid =encoding syntax: $content" | ||||
739 | ); | ||||
740 | } | ||||
741 | |||||
742 | return; | ||||
743 | } | ||||
744 | |||||
745 | #~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~` | ||||
746 | |||||
747 | { | ||||
748 | 1 | 200ns | my $m = -321; # magic line number | ||
749 | |||||
750 | sub _gen_errata { | ||||
751 | my $self = $_[0]; | ||||
752 | # Return 0 or more fake-o paragraphs explaining the accumulated | ||||
753 | # errors on this document. | ||||
754 | |||||
755 | return() unless $self->{'errata'} and keys %{$self->{'errata'}}; | ||||
756 | |||||
757 | my @out; | ||||
758 | |||||
759 | foreach my $line (sort {$a <=> $b} keys %{$self->{'errata'}}) { | ||||
760 | push @out, | ||||
761 | ['=item', {'start_line' => $m}, "Around line $line:"], | ||||
762 | map( ['~Para', {'start_line' => $m, '~cooked' => 1}, | ||||
763 | #['~Top', {'start_line' => $m}, | ||||
764 | $_ | ||||
765 | #] | ||||
766 | ], | ||||
767 | @{$self->{'errata'}{$line}} | ||||
768 | ) | ||||
769 | ; | ||||
770 | } | ||||
771 | |||||
772 | # TODO: report of unknown entities? unrenderable characters? | ||||
773 | |||||
774 | unshift @out, | ||||
775 | ['=head1', {'start_line' => $m, 'errata' => 1}, 'POD ERRORS'], | ||||
776 | ['~Para', {'start_line' => $m, '~cooked' => 1, 'errata' => 1}, | ||||
777 | "Hey! ", | ||||
778 | ['B', {}, | ||||
779 | 'The above document had some coding errors, which are explained below:' | ||||
780 | ] | ||||
781 | ], | ||||
782 | ['=over', {'start_line' => $m, 'errata' => 1}, ''], | ||||
783 | ; | ||||
784 | |||||
785 | push @out, | ||||
786 | ['=back', {'start_line' => $m, 'errata' => 1}, ''], | ||||
787 | ; | ||||
788 | |||||
789 | DEBUG and print STDERR "\n<<\n", pretty(\@out), "\n>>\n\n"; | ||||
790 | |||||
791 | return @out; | ||||
792 | } | ||||
793 | |||||
794 | } | ||||
795 | |||||
796 | #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | ||||
797 | |||||
798 | ############################################################################## | ||||
799 | ## | ||||
800 | ## stop reading now stop reading now stop reading now stop reading now stop | ||||
801 | ## | ||||
802 | ## HERE IT BECOMES REALLY SCARY | ||||
803 | ## | ||||
804 | ## stop reading now stop reading now stop reading now stop reading now stop | ||||
805 | ## | ||||
806 | ############################################################################## | ||||
807 | |||||
808 | 1 | 300ns | sub _ponder_paragraph_buffer { | ||
809 | |||||
810 | # Para-token types as found in the buffer. | ||||
811 | # ~Verbatim, ~Para, ~end, =head1..4, =for, =begin, =end, | ||||
812 | # =over, =back, =item | ||||
813 | # and the null =pod (to be complained about if over one line) | ||||
814 | # | ||||
815 | # "~data" paragraphs are something we generate at this level, depending on | ||||
816 | # a currently open =over region | ||||
817 | |||||
818 | # Events fired: Begin and end for: | ||||
819 | # directivename (like head1 .. head4), item, extend, | ||||
820 | # for (from =begin...=end, =for), | ||||
821 | # over-bullet, over-number, over-text, over-block, | ||||
822 | # item-bullet, item-number, item-text, | ||||
823 | # Document, | ||||
824 | # Data, Para, Verbatim | ||||
825 | # B, C, longdirname (TODO -- wha?), etc. for all directives | ||||
826 | # | ||||
827 | |||||
828 | my $self = $_[0]; | ||||
829 | my $paras; | ||||
830 | return unless @{$paras = $self->{'paras'}}; | ||||
831 | my $curr_open = ($self->{'curr_open'} ||= []); | ||||
832 | |||||
833 | my $scratch; | ||||
834 | |||||
835 | DEBUG > 10 and print STDERR "# Paragraph buffer: <<", pretty($paras), ">>\n"; | ||||
836 | |||||
837 | # We have something in our buffer. So apparently the document has started. | ||||
838 | unless($self->{'doc_has_started'}) { | ||||
839 | $self->{'doc_has_started'} = 1; | ||||
840 | |||||
841 | my $starting_contentless; | ||||
842 | $starting_contentless = | ||||
843 | ( | ||||
844 | !@$curr_open | ||||
845 | and @$paras and ! grep $_->[0] ne '~end', @$paras | ||||
846 | # i.e., if the paras is all ~ends | ||||
847 | ) | ||||
848 | ; | ||||
849 | DEBUG and print STDERR "# Starting ", | ||||
850 | $starting_contentless ? 'contentless' : 'contentful', | ||||
851 | " document\n" | ||||
852 | ; | ||||
853 | |||||
854 | $self->_handle_element_start( | ||||
855 | ($scratch = 'Document'), | ||||
856 | { | ||||
857 | 'start_line' => $paras->[0][1]{'start_line'}, | ||||
858 | $starting_contentless ? ( 'contentless' => 1 ) : (), | ||||
859 | }, | ||||
860 | ); | ||||
861 | } | ||||
862 | |||||
863 | my($para, $para_type); | ||||
864 | while(@$paras) { | ||||
865 | |||||
866 | # If a directive, assume it's legal; subtract below if found not to be | ||||
867 | $seen_legal_directive++ if $paras->[0][0] =~ /^=/; | ||||
868 | |||||
869 | last if @$paras == 1 | ||||
870 | and ( $paras->[0][0] eq '=over' | ||||
871 | or $paras->[0][0] eq '=item' | ||||
872 | or ($paras->[0][0] eq '~Verbatim' and $self->{'in_pod'})); | ||||
873 | # Those're the three kinds of paragraphs that require lookahead. | ||||
874 | # Actually, an "=item Foo" inside an <over type=text> region | ||||
875 | # and any =item inside an <over type=block> region (rare) | ||||
876 | # don't require any lookahead, but all others (bullets | ||||
877 | # and numbers) do. | ||||
878 | # The verbatim is different from the other two, because those might be | ||||
879 | # like: | ||||
880 | # | ||||
881 | # =item | ||||
882 | # ... | ||||
883 | # =cut | ||||
884 | # ... | ||||
885 | # =item | ||||
886 | # | ||||
887 | # The =cut here finishes the paragraph but doesn't terminate the =over | ||||
888 | # they should be in. (khw apologizes that he didn't comment at the time | ||||
889 | # why the 'in_pod' works, and no longer remembers why, and doesn't think | ||||
890 | # it is currently worth the effort to re-figure it out.) | ||||
891 | |||||
892 | # TODO: whinge about many kinds of directives in non-resolving =for regions? | ||||
893 | # TODO: many? like what? =head1 etc? | ||||
894 | |||||
895 | $para = shift @$paras; | ||||
896 | $para_type = $para->[0]; | ||||
897 | |||||
898 | DEBUG > 1 and print STDERR "Pondering a $para_type paragraph, given the stack: (", | ||||
899 | $self->_dump_curr_open(), ")\n"; | ||||
900 | |||||
901 | if($para_type eq '=for') { | ||||
902 | next if $self->_ponder_for($para,$curr_open,$paras); | ||||
903 | |||||
904 | } elsif($para_type eq '=begin') { | ||||
905 | next if $self->_ponder_begin($para,$curr_open,$paras); | ||||
906 | |||||
907 | } elsif($para_type eq '=end') { | ||||
908 | next if $self->_ponder_end($para,$curr_open,$paras); | ||||
909 | |||||
910 | } elsif($para_type eq '~end') { # The virtual end-document signal | ||||
911 | next if $self->_ponder_doc_end($para,$curr_open,$paras); | ||||
912 | } | ||||
913 | |||||
914 | |||||
915 | # ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ | ||||
916 | #~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ | ||||
917 | if(grep $_->[1]{'~ignore'}, @$curr_open) { | ||||
918 | DEBUG > 1 and | ||||
919 | print STDERR "Skipping $para_type paragraph because in ignore mode.\n"; | ||||
920 | next; | ||||
921 | } | ||||
922 | #~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ | ||||
923 | # ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ | ||||
924 | |||||
925 | if($para_type eq '=pod') { | ||||
926 | $self->_ponder_pod($para,$curr_open,$paras); | ||||
927 | |||||
928 | } elsif($para_type eq '=over') { | ||||
929 | next if $self->_ponder_over($para,$curr_open,$paras); | ||||
930 | |||||
931 | } elsif($para_type eq '=back') { | ||||
932 | next if $self->_ponder_back($para,$curr_open,$paras); | ||||
933 | |||||
934 | } else { | ||||
935 | |||||
936 | # All non-magical codes!!! | ||||
937 | |||||
938 | # Here we start using $para_type for our own twisted purposes, to | ||||
939 | # mean how it should get treated, not as what the element name | ||||
940 | # should be. | ||||
941 | |||||
942 | DEBUG > 1 and print STDERR "Pondering non-magical $para_type\n"; | ||||
943 | |||||
944 | my $i; | ||||
945 | |||||
946 | # Enforce some =headN discipline | ||||
947 | if($para_type =~ m/^=head\d$/s | ||||
948 | and ! $self->{'accept_heads_anywhere'} | ||||
949 | and @$curr_open | ||||
950 | and $curr_open->[-1][0] eq '=over' | ||||
951 | ) { | ||||
952 | DEBUG > 2 and print STDERR "'=$para_type' inside an '=over'!\n"; | ||||
953 | $self->whine( | ||||
954 | $para->[1]{'start_line'}, | ||||
955 | "You forgot a '=back' before '$para_type'" | ||||
956 | ); | ||||
957 | unshift @$paras, ['=back', {}, ''], $para; # close the =over | ||||
958 | next; | ||||
959 | } | ||||
960 | |||||
961 | |||||
962 | if($para_type eq '=item') { | ||||
963 | |||||
964 | my $over; | ||||
965 | unless(@$curr_open and | ||||
966 | $over = (grep { $_->[0] eq '=over' } @$curr_open)[-1]) { | ||||
967 | $self->whine( | ||||
968 | $para->[1]{'start_line'}, | ||||
969 | "'=item' outside of any '=over'" | ||||
970 | ); | ||||
971 | unshift @$paras, | ||||
972 | ['=over', {'start_line' => $para->[1]{'start_line'}}, ''], | ||||
973 | $para | ||||
974 | ; | ||||
975 | next; | ||||
976 | } | ||||
977 | |||||
978 | |||||
979 | my $over_type = $over->[1]{'~type'}; | ||||
980 | |||||
981 | if(!$over_type) { | ||||
982 | # Shouldn't happen1 | ||||
983 | die "Typeless over in stack, starting at line " | ||||
984 | . $over->[1]{'start_line'}; | ||||
985 | |||||
986 | } elsif($over_type eq 'block') { | ||||
987 | unless($curr_open->[-1][1]{'~bitched_about'}) { | ||||
988 | $curr_open->[-1][1]{'~bitched_about'} = 1; | ||||
989 | $self->whine( | ||||
990 | $curr_open->[-1][1]{'start_line'}, | ||||
991 | "You can't have =items (as at line " | ||||
992 | . $para->[1]{'start_line'} | ||||
993 | . ") unless the first thing after the =over is an =item" | ||||
994 | ); | ||||
995 | } | ||||
996 | # Just turn it into a paragraph and reconsider it | ||||
997 | $para->[0] = '~Para'; | ||||
998 | unshift @$paras, $para; | ||||
999 | next; | ||||
1000 | |||||
1001 | } elsif($over_type eq 'text') { | ||||
1002 | my $item_type = $self->_get_item_type($para); | ||||
1003 | # That kills the content of the item if it's a number or bullet. | ||||
1004 | DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n"; | ||||
1005 | |||||
1006 | if($item_type eq 'text') { | ||||
1007 | # Nothing special needs doing for 'text' | ||||
1008 | } elsif($item_type eq 'number' or $item_type eq 'bullet') { | ||||
1009 | $self->whine( | ||||
1010 | $para->[1]{'start_line'}, | ||||
1011 | "Expected text after =item, not a $item_type" | ||||
1012 | ); | ||||
1013 | # Undo our clobbering: | ||||
1014 | push @$para, $para->[1]{'~orig_content'}; | ||||
1015 | delete $para->[1]{'number'}; | ||||
1016 | # Only a PROPER item-number element is allowed | ||||
1017 | # to have a number attribute. | ||||
1018 | } else { | ||||
1019 | die "Unhandled item type $item_type"; # should never happen | ||||
1020 | } | ||||
1021 | |||||
1022 | # =item-text thingies don't need any assimilation, it seems. | ||||
1023 | |||||
1024 | } elsif($over_type eq 'number') { | ||||
1025 | my $item_type = $self->_get_item_type($para); | ||||
1026 | # That kills the content of the item if it's a number or bullet. | ||||
1027 | DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n"; | ||||
1028 | |||||
1029 | my $expected_value = ++ $curr_open->[-1][1]{'~counter'}; | ||||
1030 | |||||
1031 | if($item_type eq 'bullet') { | ||||
1032 | # Hm, it's not numeric. Correct for this. | ||||
1033 | $para->[1]{'number'} = $expected_value; | ||||
1034 | $self->whine( | ||||
1035 | $para->[1]{'start_line'}, | ||||
1036 | "Expected '=item $expected_value'" | ||||
1037 | ); | ||||
1038 | push @$para, $para->[1]{'~orig_content'}; | ||||
1039 | # restore the bullet, blocking the assimilation of next para | ||||
1040 | |||||
1041 | } elsif($item_type eq 'text') { | ||||
1042 | # Hm, it's not numeric. Correct for this. | ||||
1043 | $para->[1]{'number'} = $expected_value; | ||||
1044 | $self->whine( | ||||
1045 | $para->[1]{'start_line'}, | ||||
1046 | "Expected '=item $expected_value'" | ||||
1047 | ); | ||||
1048 | # Text content will still be there and will block next ~Para | ||||
1049 | |||||
1050 | } elsif($item_type ne 'number') { | ||||
1051 | die "Unknown item type $item_type"; # should never happen | ||||
1052 | |||||
1053 | } elsif($expected_value == $para->[1]{'number'}) { | ||||
1054 | DEBUG > 1 and print STDERR " Numeric item has the expected value of $expected_value\n"; | ||||
1055 | |||||
1056 | } else { | ||||
1057 | DEBUG > 1 and print STDERR " Numeric item has ", $para->[1]{'number'}, | ||||
1058 | " instead of the expected value of $expected_value\n"; | ||||
1059 | $self->whine( | ||||
1060 | $para->[1]{'start_line'}, | ||||
1061 | "You have '=item " . $para->[1]{'number'} . | ||||
1062 | "' instead of the expected '=item $expected_value'" | ||||
1063 | ); | ||||
1064 | $para->[1]{'number'} = $expected_value; # correcting!! | ||||
1065 | } | ||||
1066 | |||||
1067 | if(@$para == 2) { | ||||
1068 | # For the cases where we /didn't/ push to @$para | ||||
1069 | if($paras->[0][0] eq '~Para') { | ||||
1070 | DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n"; | ||||
1071 | push @$para, splice @{shift @$paras},2; | ||||
1072 | } else { | ||||
1073 | DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n"; | ||||
1074 | push @$para, ''; # Just so it's not contentless | ||||
1075 | } | ||||
1076 | } | ||||
1077 | |||||
1078 | |||||
1079 | } elsif($over_type eq 'bullet') { | ||||
1080 | my $item_type = $self->_get_item_type($para); | ||||
1081 | # That kills the content of the item if it's a number or bullet. | ||||
1082 | DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n"; | ||||
1083 | |||||
1084 | if($item_type eq 'bullet') { | ||||
1085 | # as expected! | ||||
1086 | |||||
1087 | if( $para->[1]{'~_freaky_para_hack'} ) { | ||||
1088 | DEBUG and print STDERR "Accomodating '=item * Foo' tolerance hack.\n"; | ||||
1089 | push @$para, $para->[1]{'~_freaky_para_hack'}; | ||||
1090 | } | ||||
1091 | |||||
1092 | } elsif($item_type eq 'number') { | ||||
1093 | $self->whine( | ||||
1094 | $para->[1]{'start_line'}, | ||||
1095 | "Expected '=item *'" | ||||
1096 | ); | ||||
1097 | push @$para, $para->[1]{'~orig_content'}; | ||||
1098 | # and block assimilation of the next paragraph | ||||
1099 | delete $para->[1]{'number'}; | ||||
1100 | # Only a PROPER item-number element is allowed | ||||
1101 | # to have a number attribute. | ||||
1102 | } elsif($item_type eq 'text') { | ||||
1103 | $self->whine( | ||||
1104 | $para->[1]{'start_line'}, | ||||
1105 | "Expected '=item *'" | ||||
1106 | ); | ||||
1107 | # But doesn't need processing. But it'll block assimilation | ||||
1108 | # of the next para. | ||||
1109 | } else { | ||||
1110 | die "Unhandled item type $item_type"; # should never happen | ||||
1111 | } | ||||
1112 | |||||
1113 | if(@$para == 2) { | ||||
1114 | # For the cases where we /didn't/ push to @$para | ||||
1115 | if($paras->[0][0] eq '~Para') { | ||||
1116 | DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n"; | ||||
1117 | push @$para, splice @{shift @$paras},2; | ||||
1118 | } else { | ||||
1119 | DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n"; | ||||
1120 | push @$para, ''; # Just so it's not contentless | ||||
1121 | } | ||||
1122 | } | ||||
1123 | |||||
1124 | } else { | ||||
1125 | die "Unhandled =over type \"$over_type\"?"; | ||||
1126 | # Shouldn't happen! | ||||
1127 | } | ||||
1128 | |||||
1129 | $para_type = 'Plain'; | ||||
1130 | $para->[0] .= '-' . $over_type; | ||||
1131 | # Whew. Now fall thru and process it. | ||||
1132 | |||||
1133 | |||||
1134 | } elsif($para_type eq '=extend') { | ||||
1135 | # Well, might as well implement it here. | ||||
1136 | $self->_ponder_extend($para); | ||||
1137 | next; # and skip | ||||
1138 | } elsif($para_type eq '=encoding') { | ||||
1139 | # Not actually acted on here, but we catch errors here. | ||||
1140 | $self->_handle_encoding_second_level($para); | ||||
1141 | next unless $self->keep_encoding_directive; | ||||
1142 | $para_type = 'Plain'; | ||||
1143 | } elsif($para_type eq '~Verbatim') { | ||||
1144 | $para->[0] = 'Verbatim'; | ||||
1145 | $para_type = '?Verbatim'; | ||||
1146 | } elsif($para_type eq '~Para') { | ||||
1147 | $para->[0] = 'Para'; | ||||
1148 | $para_type = '?Plain'; | ||||
1149 | } elsif($para_type eq 'Data') { | ||||
1150 | $para->[0] = 'Data'; | ||||
1151 | $para_type = '?Data'; | ||||
1152 | } elsif( $para_type =~ s/^=//s | ||||
1153 | and defined( $para_type = $self->{'accept_directives'}{$para_type} ) | ||||
1154 | ) { | ||||
1155 | DEBUG > 1 and print STDERR " Pondering known directive ${$para}[0] as $para_type\n"; | ||||
1156 | } else { | ||||
1157 | # An unknown directive! | ||||
1158 | $seen_legal_directive--; | ||||
1159 | DEBUG > 1 and printf STDERR "Unhandled directive %s (Handled: %s)\n", | ||||
1160 | $para->[0], join(' ', sort keys %{$self->{'accept_directives'}} ) | ||||
1161 | ; | ||||
1162 | $self->whine( | ||||
1163 | $para->[1]{'start_line'}, | ||||
1164 | "Unknown directive: $para->[0]" | ||||
1165 | ); | ||||
1166 | |||||
1167 | # And maybe treat it as text instead of just letting it go? | ||||
1168 | next; | ||||
1169 | } | ||||
1170 | |||||
1171 | if($para_type =~ s/^\?//s) { | ||||
1172 | if(! @$curr_open) { # usual case | ||||
1173 | DEBUG and print STDERR "Treating $para_type paragraph as such because stack is empty.\n"; | ||||
1174 | } else { | ||||
1175 | my @fors = grep $_->[0] eq '=for', @$curr_open; | ||||
1176 | DEBUG > 1 and print STDERR "Containing fors: ", | ||||
1177 | join(',', map $_->[1]{'target'}, @fors), "\n"; | ||||
1178 | |||||
1179 | if(! @fors) { | ||||
1180 | DEBUG and print STDERR "Treating $para_type paragraph as such because stack has no =for's\n"; | ||||
1181 | |||||
1182 | #} elsif(grep $_->[1]{'~resolve'}, @fors) { | ||||
1183 | #} elsif(not grep !$_->[1]{'~resolve'}, @fors) { | ||||
1184 | } elsif( $fors[-1][1]{'~resolve'} ) { | ||||
1185 | # Look to the immediately containing for | ||||
1186 | |||||
1187 | if($para_type eq 'Data') { | ||||
1188 | DEBUG and print STDERR "Treating Data paragraph as Plain/Verbatim because the containing =for ($fors[-1][1]{'target'}) is a resolver\n"; | ||||
1189 | $para->[0] = 'Para'; | ||||
1190 | $para_type = 'Plain'; | ||||
1191 | } else { | ||||
1192 | DEBUG and print STDERR "Treating $para_type paragraph as such because the containing =for ($fors[-1][1]{'target'}) is a resolver\n"; | ||||
1193 | } | ||||
1194 | } else { | ||||
1195 | DEBUG and print STDERR "Treating $para_type paragraph as Data because the containing =for ($fors[-1][1]{'target'}) is a non-resolver\n"; | ||||
1196 | $para->[0] = $para_type = 'Data'; | ||||
1197 | } | ||||
1198 | } | ||||
1199 | } | ||||
1200 | |||||
1201 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||||
1202 | if($para_type eq 'Plain') { | ||||
1203 | $self->_ponder_Plain($para); | ||||
1204 | } elsif($para_type eq 'Verbatim') { | ||||
1205 | $self->_ponder_Verbatim($para); | ||||
1206 | } elsif($para_type eq 'Data') { | ||||
1207 | $self->_ponder_Data($para); | ||||
1208 | } else { | ||||
1209 | die "\$para type is $para_type -- how did that happen?"; | ||||
1210 | # Shouldn't happen. | ||||
1211 | } | ||||
1212 | |||||
1213 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||||
1214 | $para->[0] =~ s/^[~=]//s; | ||||
1215 | |||||
1216 | DEBUG and print STDERR "\n", pretty($para), "\n"; | ||||
1217 | |||||
1218 | # traverse the treelet (which might well be just one string scalar) | ||||
1219 | $self->{'content_seen'} ||= 1 if $seen_legal_directive | ||||
1220 | && ! $self->{'~tried_gen_errata'}; | ||||
1221 | $self->_traverse_treelet_bit(@$para); | ||||
1222 | } | ||||
1223 | } | ||||
1224 | |||||
1225 | return; | ||||
1226 | } | ||||
1227 | |||||
1228 | ########################################################################### | ||||
1229 | # The sub-ponderers... | ||||
1230 | |||||
- - | |||||
1233 | sub _ponder_for { | ||||
1234 | my ($self,$para,$curr_open,$paras) = @_; | ||||
1235 | |||||
1236 | # Fake it out as a begin/end | ||||
1237 | my $target; | ||||
1238 | |||||
1239 | if(grep $_->[1]{'~ignore'}, @$curr_open) { | ||||
1240 | DEBUG > 1 and print STDERR "Ignoring ignorable =for\n"; | ||||
1241 | return 1; | ||||
1242 | } | ||||
1243 | |||||
1244 | for(my $i = 2; $i < @$para; ++$i) { | ||||
1245 | if($para->[$i] =~ s/^\s*(\S+)\s*//s) { | ||||
1246 | $target = $1; | ||||
1247 | last; | ||||
1248 | } | ||||
1249 | } | ||||
1250 | unless(defined $target) { | ||||
1251 | $self->whine( | ||||
1252 | $para->[1]{'start_line'}, | ||||
1253 | "=for without a target?" | ||||
1254 | ); | ||||
1255 | return 1; | ||||
1256 | } | ||||
1257 | DEBUG > 1 and | ||||
1258 | print STDERR "Faking out a =for $target as a =begin $target / =end $target\n"; | ||||
1259 | |||||
1260 | $para->[0] = 'Data'; | ||||
1261 | |||||
1262 | unshift @$paras, | ||||
1263 | ['=begin', | ||||
1264 | {'start_line' => $para->[1]{'start_line'}, '~really' => '=for'}, | ||||
1265 | $target, | ||||
1266 | ], | ||||
1267 | $para, | ||||
1268 | ['=end', | ||||
1269 | {'start_line' => $para->[1]{'start_line'}, '~really' => '=for'}, | ||||
1270 | $target, | ||||
1271 | ], | ||||
1272 | ; | ||||
1273 | |||||
1274 | return 1; | ||||
1275 | } | ||||
1276 | |||||
1277 | sub _ponder_begin { | ||||
1278 | my ($self,$para,$curr_open,$paras) = @_; | ||||
1279 | my $content = join ' ', splice @$para, 2; | ||||
1280 | $content =~ s/^\s+//s; | ||||
1281 | $content =~ s/\s+$//s; | ||||
1282 | unless(length($content)) { | ||||
1283 | $self->whine( | ||||
1284 | $para->[1]{'start_line'}, | ||||
1285 | "=begin without a target?" | ||||
1286 | ); | ||||
1287 | DEBUG and print STDERR "Ignoring targetless =begin\n"; | ||||
1288 | return 1; | ||||
1289 | } | ||||
1290 | |||||
1291 | my ($target, $title) = $content =~ m/^(\S+)\s*(.*)$/; | ||||
1292 | $para->[1]{'title'} = $title if ($title); | ||||
1293 | $para->[1]{'target'} = $target; # without any ':' | ||||
1294 | $content = $target; # strip off the title | ||||
1295 | |||||
1296 | $content =~ s/^:!/!:/s; | ||||
1297 | my $neg; # whether this is a negation-match | ||||
1298 | $neg = 1 if $content =~ s/^!//s; | ||||
1299 | my $to_resolve; # whether to process formatting codes | ||||
1300 | $to_resolve = 1 if $content =~ s/^://s; | ||||
1301 | |||||
1302 | my $dont_ignore; # whether this target matches us | ||||
1303 | |||||
1304 | foreach my $target_name ( | ||||
1305 | split(',', $content, -1), | ||||
1306 | $neg ? () : '*' | ||||
1307 | ) { | ||||
1308 | DEBUG > 2 and | ||||
1309 | print STDERR " Considering whether =begin $content matches $target_name\n"; | ||||
1310 | next unless $self->{'accept_targets'}{$target_name}; | ||||
1311 | |||||
1312 | DEBUG > 2 and | ||||
1313 | print STDERR " It DOES match the acceptable target $target_name!\n"; | ||||
1314 | $to_resolve = 1 | ||||
1315 | if $self->{'accept_targets'}{$target_name} eq 'force_resolve'; | ||||
1316 | $dont_ignore = 1; | ||||
1317 | $para->[1]{'target_matching'} = $target_name; | ||||
1318 | last; # stop looking at other target names | ||||
1319 | } | ||||
1320 | |||||
1321 | if($neg) { | ||||
1322 | if( $dont_ignore ) { | ||||
1323 | $dont_ignore = ''; | ||||
1324 | delete $para->[1]{'target_matching'}; | ||||
1325 | DEBUG > 2 and print STDERR " But the leading ! means that this is a NON-match!\n"; | ||||
1326 | } else { | ||||
1327 | $dont_ignore = 1; | ||||
1328 | $para->[1]{'target_matching'} = '!'; | ||||
1329 | DEBUG > 2 and print STDERR " But the leading ! means that this IS a match!\n"; | ||||
1330 | } | ||||
1331 | } | ||||
1332 | |||||
1333 | $para->[0] = '=for'; # Just what we happen to call these, internally | ||||
1334 | $para->[1]{'~really'} ||= '=begin'; | ||||
1335 | $para->[1]{'~ignore'} = (! $dont_ignore) || 0; | ||||
1336 | $para->[1]{'~resolve'} = $to_resolve || 0; | ||||
1337 | |||||
1338 | DEBUG > 1 and print STDERR " Making note to ", $dont_ignore ? 'not ' : '', | ||||
1339 | "ignore contents of this region\n"; | ||||
1340 | DEBUG > 1 and $dont_ignore and print STDERR " Making note to treat contents as ", | ||||
1341 | ($to_resolve ? 'verbatim/plain' : 'data'), " paragraphs\n"; | ||||
1342 | DEBUG > 1 and print STDERR " (Stack now: ", $self->_dump_curr_open(), ")\n"; | ||||
1343 | |||||
1344 | push @$curr_open, $para; | ||||
1345 | if(!$dont_ignore or scalar grep $_->[1]{'~ignore'}, @$curr_open) { | ||||
1346 | DEBUG > 1 and print STDERR "Ignoring ignorable =begin\n"; | ||||
1347 | } else { | ||||
1348 | $self->{'content_seen'} ||= 1 unless $self->{'~tried_gen_errata'}; | ||||
1349 | $self->_handle_element_start((my $scratch='for'), $para->[1]); | ||||
1350 | } | ||||
1351 | |||||
1352 | return 1; | ||||
1353 | } | ||||
1354 | |||||
1355 | sub _ponder_end { | ||||
1356 | my ($self,$para,$curr_open,$paras) = @_; | ||||
1357 | my $content = join ' ', splice @$para, 2; | ||||
1358 | $content =~ s/^\s+//s; | ||||
1359 | $content =~ s/\s+$//s; | ||||
1360 | DEBUG and print STDERR "Ogling '=end $content' directive\n"; | ||||
1361 | |||||
1362 | unless(length($content)) { | ||||
1363 | $self->whine( | ||||
1364 | $para->[1]{'start_line'}, | ||||
1365 | "'=end' without a target?" . ( | ||||
1366 | ( @$curr_open and $curr_open->[-1][0] eq '=for' ) | ||||
1367 | ? ( " (Should be \"=end " . $curr_open->[-1][1]{'target'} . '")' ) | ||||
1368 | : '' | ||||
1369 | ) | ||||
1370 | ); | ||||
1371 | DEBUG and print STDERR "Ignoring targetless =end\n"; | ||||
1372 | return 1; | ||||
1373 | } | ||||
1374 | |||||
1375 | unless($content =~ m/^\S+$/) { # i.e., unless it's one word | ||||
1376 | $self->whine( | ||||
1377 | $para->[1]{'start_line'}, | ||||
1378 | "'=end $content' is invalid. (Stack: " | ||||
1379 | . $self->_dump_curr_open() . ')' | ||||
1380 | ); | ||||
1381 | DEBUG and print STDERR "Ignoring mistargetted =end $content\n"; | ||||
1382 | return 1; | ||||
1383 | } | ||||
1384 | |||||
1385 | unless(@$curr_open and $curr_open->[-1][0] eq '=for') { | ||||
1386 | $self->whine( | ||||
1387 | $para->[1]{'start_line'}, | ||||
1388 | "=end $content without matching =begin. (Stack: " | ||||
1389 | . $self->_dump_curr_open() . ')' | ||||
1390 | ); | ||||
1391 | DEBUG and print STDERR "Ignoring mistargetted =end $content\n"; | ||||
1392 | return 1; | ||||
1393 | } | ||||
1394 | |||||
1395 | unless($content eq $curr_open->[-1][1]{'target'}) { | ||||
1396 | $self->whine( | ||||
1397 | $para->[1]{'start_line'}, | ||||
1398 | "=end $content doesn't match =begin " | ||||
1399 | . $curr_open->[-1][1]{'target'} | ||||
1400 | . ". (Stack: " | ||||
1401 | . $self->_dump_curr_open() . ')' | ||||
1402 | ); | ||||
1403 | DEBUG and print STDERR "Ignoring mistargetted =end $content at line $para->[1]{'start_line'}\n"; | ||||
1404 | return 1; | ||||
1405 | } | ||||
1406 | |||||
1407 | # Else it's okay to close... | ||||
1408 | if(grep $_->[1]{'~ignore'}, @$curr_open) { | ||||
1409 | DEBUG > 1 and print STDERR "Not firing any event for this =end $content because in an ignored region\n"; | ||||
1410 | # And that may be because of this to-be-closed =for region, or some | ||||
1411 | # other one, but it doesn't matter. | ||||
1412 | } else { | ||||
1413 | $curr_open->[-1][1]{'start_line'} = $para->[1]{'start_line'}; | ||||
1414 | # what's that for? | ||||
1415 | |||||
1416 | $self->{'content_seen'} ||= 1 unless $self->{'~tried_gen_errata'}; | ||||
1417 | $self->_handle_element_end( my $scratch = 'for', $para->[1]); | ||||
1418 | } | ||||
1419 | DEBUG > 1 and print STDERR "Popping $curr_open->[-1][0] $curr_open->[-1][1]{'target'} because of =end $content\n"; | ||||
1420 | pop @$curr_open; | ||||
1421 | |||||
1422 | return 1; | ||||
1423 | } | ||||
1424 | |||||
1425 | sub _ponder_doc_end { | ||||
1426 | my ($self,$para,$curr_open,$paras) = @_; | ||||
1427 | if(@$curr_open) { # Deal with things left open | ||||
1428 | DEBUG and print STDERR "Stack is nonempty at end-document: (", | ||||
1429 | $self->_dump_curr_open(), ")\n"; | ||||
1430 | |||||
1431 | DEBUG > 9 and print STDERR "Stack: ", pretty($curr_open), "\n"; | ||||
1432 | unshift @$paras, $self->_closers_for_all_curr_open; | ||||
1433 | # Make sure there is exactly one ~end in the parastack, at the end: | ||||
1434 | @$paras = grep $_->[0] ne '~end', @$paras; | ||||
1435 | push @$paras, $para, $para; | ||||
1436 | # We need two -- once for the next cycle where we | ||||
1437 | # generate errata, and then another to be at the end | ||||
1438 | # when that loop back around to process the errata. | ||||
1439 | return 1; | ||||
1440 | |||||
1441 | } else { | ||||
1442 | DEBUG and print STDERR "Okay, stack is empty now.\n"; | ||||
1443 | } | ||||
1444 | |||||
1445 | # Try generating errata section, if applicable | ||||
1446 | unless($self->{'~tried_gen_errata'}) { | ||||
1447 | $self->{'~tried_gen_errata'} = 1; | ||||
1448 | my @extras = $self->_gen_errata(); | ||||
1449 | if(@extras) { | ||||
1450 | unshift @$paras, @extras; | ||||
1451 | DEBUG and print STDERR "Generated errata... relooping...\n"; | ||||
1452 | return 1; # I.e., loop around again to process these fake-o paragraphs | ||||
1453 | } | ||||
1454 | } | ||||
1455 | |||||
1456 | splice @$paras; # Well, that's that for this paragraph buffer. | ||||
1457 | DEBUG and print STDERR "Throwing end-document event.\n"; | ||||
1458 | |||||
1459 | $self->_handle_element_end( my $scratch = 'Document' ); | ||||
1460 | return 1; # Hasta la byebye | ||||
1461 | } | ||||
1462 | |||||
1463 | sub _ponder_pod { | ||||
1464 | my ($self,$para,$curr_open,$paras) = @_; | ||||
1465 | $self->whine( | ||||
1466 | $para->[1]{'start_line'}, | ||||
1467 | "=pod directives shouldn't be over one line long! Ignoring all " | ||||
1468 | . (@$para - 2) . " lines of content" | ||||
1469 | ) if @$para > 3; | ||||
1470 | |||||
1471 | # Content ignored unless 'pod_handler' is set | ||||
1472 | if (my $pod_handler = $self->{'pod_handler'}) { | ||||
1473 | my ($line_num, $line) = map $_, $para->[1]{'start_line'}, $para->[2]; | ||||
1474 | $line = $line eq '' ? "=pod" : "=pod $line"; # imitate cut_handler output | ||||
1475 | $pod_handler->($line, $line_num, $self); | ||||
1476 | } | ||||
1477 | |||||
1478 | # The surrounding methods set content_seen, so let us remain consistent. | ||||
1479 | # I do not know why it was not here before -- should it not be here? | ||||
1480 | # $self->{'content_seen'} ||= 1 unless $self->{'~tried_gen_errata'}; | ||||
1481 | |||||
1482 | return; | ||||
1483 | } | ||||
1484 | |||||
1485 | sub _ponder_over { | ||||
1486 | my ($self,$para,$curr_open,$paras) = @_; | ||||
1487 | return 1 unless @$paras; | ||||
1488 | my $list_type; | ||||
1489 | |||||
1490 | if($paras->[0][0] eq '=item') { # most common case | ||||
1491 | $list_type = $self->_get_initial_item_type($paras->[0]); | ||||
1492 | |||||
1493 | } elsif($paras->[0][0] eq '=back') { | ||||
1494 | # Ignore empty lists by default | ||||
1495 | if ($self->{'parse_empty_lists'}) { | ||||
1496 | $list_type = 'empty'; | ||||
1497 | } else { | ||||
1498 | shift @$paras; | ||||
1499 | return 1; | ||||
1500 | } | ||||
1501 | } elsif($paras->[0][0] eq '~end') { | ||||
1502 | $self->whine( | ||||
1503 | $para->[1]{'start_line'}, | ||||
1504 | "=over is the last thing in the document?!" | ||||
1505 | ); | ||||
1506 | return 1; # But feh, ignore it. | ||||
1507 | } else { | ||||
1508 | $list_type = 'block'; | ||||
1509 | } | ||||
1510 | $para->[1]{'~type'} = $list_type; | ||||
1511 | push @$curr_open, $para; | ||||
1512 | # yes, we reuse the paragraph as a stack item | ||||
1513 | |||||
1514 | my $content = join ' ', splice @$para, 2; | ||||
1515 | $para->[1]{'~orig_content'} = $content; | ||||
1516 | my $overness; | ||||
1517 | if($content =~ m/^\s*$/s) { | ||||
1518 | $para->[1]{'indent'} = 4; | ||||
1519 | } elsif($content =~ m/^\s*((?:\d*\.)?\d+)\s*$/s) { | ||||
1520 | 2 | 2.83ms | 2 | 15µs | # spent 13µs (10+2) within Pod::Simple::BlackBox::BEGIN@1520 which was called:
# once (10µs+2µs) by Pod::Simple::LinkSection::BEGIN@7 at line 1520 # spent 13µs making 1 call to Pod::Simple::BlackBox::BEGIN@1520
# spent 2µs making 1 call to integer::unimport |
1521 | $para->[1]{'indent'} = $1; | ||||
1522 | if($1 == 0) { | ||||
1523 | $self->whine( | ||||
1524 | $para->[1]{'start_line'}, | ||||
1525 | "Can't have a 0 in =over $content" | ||||
1526 | ); | ||||
1527 | $para->[1]{'indent'} = 4; | ||||
1528 | } | ||||
1529 | } else { | ||||
1530 | $self->whine( | ||||
1531 | $para->[1]{'start_line'}, | ||||
1532 | "=over should be: '=over' or '=over positive_number'" | ||||
1533 | ); | ||||
1534 | $para->[1]{'indent'} = 4; | ||||
1535 | } | ||||
1536 | DEBUG > 1 and print STDERR "=over found of type $list_type\n"; | ||||
1537 | |||||
1538 | $self->{'content_seen'} ||= 1 unless $self->{'~tried_gen_errata'}; | ||||
1539 | $self->_handle_element_start((my $scratch = 'over-' . $list_type), $para->[1]); | ||||
1540 | |||||
1541 | return; | ||||
1542 | } | ||||
1543 | |||||
1544 | sub _ponder_back { | ||||
1545 | my ($self,$para,$curr_open,$paras) = @_; | ||||
1546 | # TODO: fire off </item-number> or </item-bullet> or </item-text> ?? | ||||
1547 | |||||
1548 | my $content = join ' ', splice @$para, 2; | ||||
1549 | if($content =~ m/\S/) { | ||||
1550 | $self->whine( | ||||
1551 | $para->[1]{'start_line'}, | ||||
1552 | "=back doesn't take any parameters, but you said =back $content" | ||||
1553 | ); | ||||
1554 | } | ||||
1555 | |||||
1556 | if(@$curr_open and $curr_open->[-1][0] eq '=over') { | ||||
1557 | DEBUG > 1 and print STDERR "=back happily closes matching =over\n"; | ||||
1558 | # Expected case: we're closing the most recently opened thing | ||||
1559 | #my $over = pop @$curr_open; | ||||
1560 | $self->{'content_seen'} ||= 1 unless $self->{'~tried_gen_errata'}; | ||||
1561 | $self->_handle_element_end( my $scratch = | ||||
1562 | 'over-' . ( (pop @$curr_open)->[1]{'~type'} ), $para->[1] | ||||
1563 | ); | ||||
1564 | } else { | ||||
1565 | DEBUG > 1 and print STDERR "=back found without a matching =over. Stack: (", | ||||
1566 | join(', ', map $_->[0], @$curr_open), ").\n"; | ||||
1567 | $self->whine( | ||||
1568 | $para->[1]{'start_line'}, | ||||
1569 | '=back without =over' | ||||
1570 | ); | ||||
1571 | return 1; # and ignore it | ||||
1572 | } | ||||
1573 | } | ||||
1574 | |||||
1575 | sub _ponder_item { | ||||
1576 | my ($self,$para,$curr_open,$paras) = @_; | ||||
1577 | my $over; | ||||
1578 | unless(@$curr_open and | ||||
1579 | $over = (grep { $_->[0] eq '=over' } @$curr_open)[-1]) { | ||||
1580 | $self->whine( | ||||
1581 | $para->[1]{'start_line'}, | ||||
1582 | "'=item' outside of any '=over'" | ||||
1583 | ); | ||||
1584 | unshift @$paras, | ||||
1585 | ['=over', {'start_line' => $para->[1]{'start_line'}}, ''], | ||||
1586 | $para | ||||
1587 | ; | ||||
1588 | return 1; | ||||
1589 | } | ||||
1590 | |||||
1591 | |||||
1592 | my $over_type = $over->[1]{'~type'}; | ||||
1593 | |||||
1594 | if(!$over_type) { | ||||
1595 | # Shouldn't happen1 | ||||
1596 | die "Typeless over in stack, starting at line " | ||||
1597 | . $over->[1]{'start_line'}; | ||||
1598 | |||||
1599 | } elsif($over_type eq 'block') { | ||||
1600 | unless($curr_open->[-1][1]{'~bitched_about'}) { | ||||
1601 | $curr_open->[-1][1]{'~bitched_about'} = 1; | ||||
1602 | $self->whine( | ||||
1603 | $curr_open->[-1][1]{'start_line'}, | ||||
1604 | "You can't have =items (as at line " | ||||
1605 | . $para->[1]{'start_line'} | ||||
1606 | . ") unless the first thing after the =over is an =item" | ||||
1607 | ); | ||||
1608 | } | ||||
1609 | # Just turn it into a paragraph and reconsider it | ||||
1610 | $para->[0] = '~Para'; | ||||
1611 | unshift @$paras, $para; | ||||
1612 | return 1; | ||||
1613 | |||||
1614 | } elsif($over_type eq 'text') { | ||||
1615 | my $item_type = $self->_get_item_type($para); | ||||
1616 | # That kills the content of the item if it's a number or bullet. | ||||
1617 | DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n"; | ||||
1618 | |||||
1619 | if($item_type eq 'text') { | ||||
1620 | # Nothing special needs doing for 'text' | ||||
1621 | } elsif($item_type eq 'number' or $item_type eq 'bullet') { | ||||
1622 | $self->whine( | ||||
1623 | $para->[1]{'start_line'}, | ||||
1624 | "Expected text after =item, not a $item_type" | ||||
1625 | ); | ||||
1626 | # Undo our clobbering: | ||||
1627 | push @$para, $para->[1]{'~orig_content'}; | ||||
1628 | delete $para->[1]{'number'}; | ||||
1629 | # Only a PROPER item-number element is allowed | ||||
1630 | # to have a number attribute. | ||||
1631 | } else { | ||||
1632 | die "Unhandled item type $item_type"; # should never happen | ||||
1633 | } | ||||
1634 | |||||
1635 | # =item-text thingies don't need any assimilation, it seems. | ||||
1636 | |||||
1637 | } elsif($over_type eq 'number') { | ||||
1638 | my $item_type = $self->_get_item_type($para); | ||||
1639 | # That kills the content of the item if it's a number or bullet. | ||||
1640 | DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n"; | ||||
1641 | |||||
1642 | my $expected_value = ++ $curr_open->[-1][1]{'~counter'}; | ||||
1643 | |||||
1644 | if($item_type eq 'bullet') { | ||||
1645 | # Hm, it's not numeric. Correct for this. | ||||
1646 | $para->[1]{'number'} = $expected_value; | ||||
1647 | $self->whine( | ||||
1648 | $para->[1]{'start_line'}, | ||||
1649 | "Expected '=item $expected_value'" | ||||
1650 | ); | ||||
1651 | push @$para, $para->[1]{'~orig_content'}; | ||||
1652 | # restore the bullet, blocking the assimilation of next para | ||||
1653 | |||||
1654 | } elsif($item_type eq 'text') { | ||||
1655 | # Hm, it's not numeric. Correct for this. | ||||
1656 | $para->[1]{'number'} = $expected_value; | ||||
1657 | $self->whine( | ||||
1658 | $para->[1]{'start_line'}, | ||||
1659 | "Expected '=item $expected_value'" | ||||
1660 | ); | ||||
1661 | # Text content will still be there and will block next ~Para | ||||
1662 | |||||
1663 | } elsif($item_type ne 'number') { | ||||
1664 | die "Unknown item type $item_type"; # should never happen | ||||
1665 | |||||
1666 | } elsif($expected_value == $para->[1]{'number'}) { | ||||
1667 | DEBUG > 1 and print STDERR " Numeric item has the expected value of $expected_value\n"; | ||||
1668 | |||||
1669 | } else { | ||||
1670 | DEBUG > 1 and print STDERR " Numeric item has ", $para->[1]{'number'}, | ||||
1671 | " instead of the expected value of $expected_value\n"; | ||||
1672 | $self->whine( | ||||
1673 | $para->[1]{'start_line'}, | ||||
1674 | "You have '=item " . $para->[1]{'number'} . | ||||
1675 | "' instead of the expected '=item $expected_value'" | ||||
1676 | ); | ||||
1677 | $para->[1]{'number'} = $expected_value; # correcting!! | ||||
1678 | } | ||||
1679 | |||||
1680 | if(@$para == 2) { | ||||
1681 | # For the cases where we /didn't/ push to @$para | ||||
1682 | if($paras->[0][0] eq '~Para') { | ||||
1683 | DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n"; | ||||
1684 | push @$para, splice @{shift @$paras},2; | ||||
1685 | } else { | ||||
1686 | DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n"; | ||||
1687 | push @$para, ''; # Just so it's not contentless | ||||
1688 | } | ||||
1689 | } | ||||
1690 | |||||
1691 | |||||
1692 | } elsif($over_type eq 'bullet') { | ||||
1693 | my $item_type = $self->_get_item_type($para); | ||||
1694 | # That kills the content of the item if it's a number or bullet. | ||||
1695 | DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n"; | ||||
1696 | |||||
1697 | if($item_type eq 'bullet') { | ||||
1698 | # as expected! | ||||
1699 | |||||
1700 | if( $para->[1]{'~_freaky_para_hack'} ) { | ||||
1701 | DEBUG and print STDERR "Accomodating '=item * Foo' tolerance hack.\n"; | ||||
1702 | push @$para, $para->[1]{'~_freaky_para_hack'}; | ||||
1703 | } | ||||
1704 | |||||
1705 | } elsif($item_type eq 'number') { | ||||
1706 | $self->whine( | ||||
1707 | $para->[1]{'start_line'}, | ||||
1708 | "Expected '=item *'" | ||||
1709 | ); | ||||
1710 | push @$para, $para->[1]{'~orig_content'}; | ||||
1711 | # and block assimilation of the next paragraph | ||||
1712 | delete $para->[1]{'number'}; | ||||
1713 | # Only a PROPER item-number element is allowed | ||||
1714 | # to have a number attribute. | ||||
1715 | } elsif($item_type eq 'text') { | ||||
1716 | $self->whine( | ||||
1717 | $para->[1]{'start_line'}, | ||||
1718 | "Expected '=item *'" | ||||
1719 | ); | ||||
1720 | # But doesn't need processing. But it'll block assimilation | ||||
1721 | # of the next para. | ||||
1722 | } else { | ||||
1723 | die "Unhandled item type $item_type"; # should never happen | ||||
1724 | } | ||||
1725 | |||||
1726 | if(@$para == 2) { | ||||
1727 | # For the cases where we /didn't/ push to @$para | ||||
1728 | if($paras->[0][0] eq '~Para') { | ||||
1729 | DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n"; | ||||
1730 | push @$para, splice @{shift @$paras},2; | ||||
1731 | } else { | ||||
1732 | DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n"; | ||||
1733 | push @$para, ''; # Just so it's not contentless | ||||
1734 | } | ||||
1735 | } | ||||
1736 | |||||
1737 | } else { | ||||
1738 | die "Unhandled =over type \"$over_type\"?"; | ||||
1739 | # Shouldn't happen! | ||||
1740 | } | ||||
1741 | $para->[0] .= '-' . $over_type; | ||||
1742 | |||||
1743 | return; | ||||
1744 | } | ||||
1745 | |||||
1746 | sub _ponder_Plain { | ||||
1747 | my ($self,$para) = @_; | ||||
1748 | DEBUG and print STDERR " giving plain treatment...\n"; | ||||
1749 | unless( @$para == 2 or ( @$para == 3 and $para->[2] eq '' ) | ||||
1750 | or $para->[1]{'~cooked'} | ||||
1751 | ) { | ||||
1752 | push @$para, | ||||
1753 | @{$self->_make_treelet( | ||||
1754 | join("\n", splice(@$para, 2)), | ||||
1755 | $para->[1]{'start_line'} | ||||
1756 | )}; | ||||
1757 | } | ||||
1758 | # Empty paragraphs don't need a treelet for any reason I can see. | ||||
1759 | # And precooked paragraphs already have a treelet. | ||||
1760 | return; | ||||
1761 | } | ||||
1762 | |||||
1763 | sub _ponder_Verbatim { | ||||
1764 | my ($self,$para) = @_; | ||||
1765 | DEBUG and print STDERR " giving verbatim treatment...\n"; | ||||
1766 | |||||
1767 | $para->[1]{'xml:space'} = 'preserve'; | ||||
1768 | |||||
1769 | unless ($self->{'_output_is_for_JustPod'}) { | ||||
1770 | # Fix illegal settings for expand_verbatim_tabs() | ||||
1771 | # This is because this module doesn't do input error checking, but khw | ||||
1772 | # doesn't want to add yet another instance of that. | ||||
1773 | $self->expand_verbatim_tabs(8) | ||||
1774 | if ! defined $self->expand_verbatim_tabs() | ||||
1775 | || $self->expand_verbatim_tabs() =~ /\D/; | ||||
1776 | |||||
1777 | my $indent = $self->strip_verbatim_indent; | ||||
1778 | if ($indent && ref $indent eq 'CODE') { | ||||
1779 | my @shifted = (shift @{$para}, shift @{$para}); | ||||
1780 | $indent = $indent->($para); | ||||
1781 | unshift @{$para}, @shifted; | ||||
1782 | } | ||||
1783 | |||||
1784 | for(my $i = 2; $i < @$para; $i++) { | ||||
1785 | foreach my $line ($para->[$i]) { # just for aliasing | ||||
1786 | # Strip indentation. | ||||
1787 | $line =~ s/^\Q$indent// if $indent; | ||||
1788 | next unless $self->expand_verbatim_tabs; | ||||
1789 | |||||
1790 | # This is commented out because of github issue #85, and the | ||||
1791 | # current maintainers don't know why it was there in the first | ||||
1792 | # place. | ||||
1793 | #&& !($self->{accept_codes} && $self->{accept_codes}{VerbatimFormatted}); | ||||
1794 | while( $line =~ | ||||
1795 | # Sort of adapted from Text::Tabs. | ||||
1796 | s/^([^\t]*)(\t+)/$1.(" " x ((length($2) | ||||
1797 | * $self->expand_verbatim_tabs) | ||||
1798 | -(length($1)&7)))/e | ||||
1799 | ) {} | ||||
1800 | |||||
1801 | # TODO: whinge about (or otherwise treat) unindented or overlong lines | ||||
1802 | |||||
1803 | } | ||||
1804 | } | ||||
1805 | } | ||||
1806 | |||||
1807 | # Now the VerbatimFormatted hoodoo... | ||||
1808 | if( $self->{'accept_codes'} and | ||||
1809 | $self->{'accept_codes'}{'VerbatimFormatted'} | ||||
1810 | ) { | ||||
1811 | while(@$para > 3 and $para->[-1] !~ m/\S/) { pop @$para } | ||||
1812 | # Kill any number of terminal newlines | ||||
1813 | $self->_verbatim_format($para); | ||||
1814 | } elsif ($self->{'codes_in_verbatim'}) { | ||||
1815 | push @$para, | ||||
1816 | @{$self->_make_treelet( | ||||
1817 | join("\n", splice(@$para, 2)), | ||||
1818 | $para->[1]{'start_line'}, $para->[1]{'xml:space'} | ||||
1819 | )}; | ||||
1820 | $para->[-1] =~ s/\n+$//s; # Kill any number of terminal newlines | ||||
1821 | } else { | ||||
1822 | push @$para, join "\n", splice(@$para, 2) if @$para > 3; | ||||
1823 | $para->[-1] =~ s/\n+$//s; # Kill any number of terminal newlines | ||||
1824 | } | ||||
1825 | return; | ||||
1826 | } | ||||
1827 | |||||
1828 | sub _ponder_Data { | ||||
1829 | my ($self,$para) = @_; | ||||
1830 | DEBUG and print STDERR " giving data treatment...\n"; | ||||
1831 | $para->[1]{'xml:space'} = 'preserve'; | ||||
1832 | push @$para, join "\n", splice(@$para, 2) if @$para > 3; | ||||
1833 | return; | ||||
1834 | } | ||||
1835 | |||||
- - | |||||
1839 | ########################################################################### | ||||
1840 | |||||
1841 | sub _traverse_treelet_bit { # for use only by the routine above | ||||
1842 | my($self, $name) = splice @_,0,2; | ||||
1843 | |||||
1844 | my $scratch; | ||||
1845 | $self->_handle_element_start(($scratch=$name), shift @_); | ||||
1846 | |||||
1847 | while (@_) { | ||||
1848 | my $x = shift; | ||||
1849 | if (ref($x)) { | ||||
1850 | &_traverse_treelet_bit($self, @$x); | ||||
1851 | } else { | ||||
1852 | $x .= shift while @_ && !ref($_[0]); | ||||
1853 | $self->_handle_text($x); | ||||
1854 | } | ||||
1855 | } | ||||
1856 | |||||
1857 | $self->_handle_element_end($scratch=$name); | ||||
1858 | return; | ||||
1859 | } | ||||
1860 | |||||
1861 | #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | ||||
1862 | |||||
1863 | sub _closers_for_all_curr_open { | ||||
1864 | my $self = $_[0]; | ||||
1865 | my @closers; | ||||
1866 | foreach my $still_open (@{ $self->{'curr_open'} || return }) { | ||||
1867 | my @copy = @$still_open; | ||||
1868 | $copy[1] = {%{ $copy[1] }}; | ||||
1869 | #$copy[1]{'start_line'} = -1; | ||||
1870 | if($copy[0] eq '=for') { | ||||
1871 | $copy[0] = '=end'; | ||||
1872 | } elsif($copy[0] eq '=over') { | ||||
1873 | $self->whine( | ||||
1874 | $still_open->[1]{start_line} , | ||||
1875 | "=over without closing =back" | ||||
1876 | ); | ||||
1877 | |||||
1878 | $copy[0] = '=back'; | ||||
1879 | } else { | ||||
1880 | die "I don't know how to auto-close an open $copy[0] region"; | ||||
1881 | } | ||||
1882 | |||||
1883 | unless( @copy > 2 ) { | ||||
1884 | push @copy, $copy[1]{'target'}; | ||||
1885 | $copy[-1] = '' unless defined $copy[-1]; | ||||
1886 | # since =over's don't have targets | ||||
1887 | } | ||||
1888 | |||||
1889 | $copy[1]{'fake-closer'} = 1; | ||||
1890 | |||||
1891 | DEBUG and print STDERR "Queuing up fake-o event: ", pretty(\@copy), "\n"; | ||||
1892 | unshift @closers, \@copy; | ||||
1893 | } | ||||
1894 | return @closers; | ||||
1895 | } | ||||
1896 | |||||
1897 | #-------------------------------------------------------------------------- | ||||
1898 | |||||
1899 | sub _verbatim_format { | ||||
1900 | my($it, $p) = @_; | ||||
1901 | |||||
1902 | my $formatting; | ||||
1903 | |||||
1904 | for(my $i = 2; $i < @$p; $i++) { # work backwards over the lines | ||||
1905 | DEBUG and print STDERR "_verbatim_format appends a newline to $i: $p->[$i]\n"; | ||||
1906 | $p->[$i] .= "\n"; | ||||
1907 | # Unlike with simple Verbatim blocks, we don't end up just doing | ||||
1908 | # a join("\n", ...) on the contents, so we have to append a | ||||
1909 | # newline to every line, and then nix the last one later. | ||||
1910 | } | ||||
1911 | |||||
1912 | if( DEBUG > 4 ) { | ||||
1913 | print STDERR "<<\n"; | ||||
1914 | for(my $i = $#$p; $i >= 2; $i--) { # work backwards over the lines | ||||
1915 | print STDERR "_verbatim_format $i: $p->[$i]"; | ||||
1916 | } | ||||
1917 | print STDERR ">>\n"; | ||||
1918 | } | ||||
1919 | |||||
1920 | for(my $i = $#$p; $i > 2; $i--) { | ||||
1921 | # work backwards over the lines, except the first (#2) | ||||
1922 | |||||
1923 | #next unless $p->[$i] =~ m{^#:([ \^\/\%]*)\n?$}s | ||||
1924 | # and $p->[$i-1] !~ m{^#:[ \^\/\%]*\n?$}s; | ||||
1925 | # look at a formatty line preceding a nonformatty one | ||||
1926 | DEBUG > 5 and print STDERR "Scrutinizing line $i: $$p[$i]\n"; | ||||
1927 | if($p->[$i] =~ m{^#:([ \^\/\%]*)\n?$}s) { | ||||
1928 | DEBUG > 5 and print STDERR " It's a formatty line. ", | ||||
1929 | "Peeking at previous line ", $i-1, ": $$p[$i-1]: \n"; | ||||
1930 | |||||
1931 | if( $p->[$i-1] =~ m{^#:[ \^\/\%]*\n?$}s ) { | ||||
1932 | DEBUG > 5 and print STDERR " Previous line is formatty! Skipping this one.\n"; | ||||
1933 | next; | ||||
1934 | } else { | ||||
1935 | DEBUG > 5 and print STDERR " Previous line is non-formatty! Yay!\n"; | ||||
1936 | } | ||||
1937 | } else { | ||||
1938 | DEBUG > 5 and print STDERR " It's not a formatty line. Ignoring\n"; | ||||
1939 | next; | ||||
1940 | } | ||||
1941 | |||||
1942 | # A formatty line has to have #: in the first two columns, and uses | ||||
1943 | # "^" to mean bold, "/" to mean underline, and "%" to mean bold italic. | ||||
1944 | # Example: | ||||
1945 | # What do you want? i like pie. [or whatever] | ||||
1946 | # #:^^^^^^^^^^^^^^^^^ ///////////// | ||||
1947 | |||||
1948 | |||||
1949 | DEBUG > 4 and print STDERR "_verbatim_format considers:\n<$p->[$i-1]>\n<$p->[$i]>\n"; | ||||
1950 | |||||
1951 | $formatting = ' ' . $1; | ||||
1952 | $formatting =~ s/\s+$//s; # nix trailing whitespace | ||||
1953 | unless(length $formatting and $p->[$i-1] =~ m/\S/) { # no-op | ||||
1954 | splice @$p,$i,1; # remove this line | ||||
1955 | $i--; # don't consider next line | ||||
1956 | next; | ||||
1957 | } | ||||
1958 | |||||
1959 | if( length($formatting) >= length($p->[$i-1]) ) { | ||||
1960 | $formatting = substr($formatting, 0, length($p->[$i-1]) - 1) . ' '; | ||||
1961 | } else { | ||||
1962 | $formatting .= ' ' x (length($p->[$i-1]) - length($formatting)); | ||||
1963 | } | ||||
1964 | # Make $formatting and the previous line be exactly the same length, | ||||
1965 | # with $formatting having a " " as the last character. | ||||
1966 | |||||
1967 | DEBUG > 4 and print STDERR "Formatting <$formatting> on <", $p->[$i-1], ">\n"; | ||||
1968 | |||||
1969 | |||||
1970 | my @new_line; | ||||
1971 | while( $formatting =~ m{\G(( +)|(\^+)|(\/+)|(\%+))}g ) { | ||||
1972 | #print STDERR "Format matches $1\n"; | ||||
1973 | |||||
1974 | if($2) { | ||||
1975 | #print STDERR "SKIPPING <$2>\n"; | ||||
1976 | push @new_line, | ||||
1977 | substr($p->[$i-1], pos($formatting)-length($1), length($1)); | ||||
1978 | } else { | ||||
1979 | #print STDERR "SNARING $+\n"; | ||||
1980 | push @new_line, [ | ||||
1981 | ( | ||||
1982 | $3 ? 'VerbatimB' : | ||||
1983 | $4 ? 'VerbatimI' : | ||||
1984 | $5 ? 'VerbatimBI' : die("Should never get called") | ||||
1985 | ), {}, | ||||
1986 | substr($p->[$i-1], pos($formatting)-length($1), length($1)) | ||||
1987 | ]; | ||||
1988 | #print STDERR "Formatting <$new_line[-1][-1]> as $new_line[-1][0]\n"; | ||||
1989 | } | ||||
1990 | } | ||||
1991 | my @nixed = | ||||
1992 | splice @$p, $i-1, 2, @new_line; # replace myself and the next line | ||||
1993 | DEBUG > 10 and print STDERR "Nixed count: ", scalar(@nixed), "\n"; | ||||
1994 | |||||
1995 | DEBUG > 6 and print STDERR "New version of the above line is these tokens (", | ||||
1996 | scalar(@new_line), "):", | ||||
1997 | map( ref($_)?"<@$_> ":"<$_>", @new_line ), "\n"; | ||||
1998 | $i--; # So the next line we scrutinize is the line before the one | ||||
1999 | # that we just went and formatted | ||||
2000 | } | ||||
2001 | |||||
2002 | $p->[0] = 'VerbatimFormatted'; | ||||
2003 | |||||
2004 | # Collapse adjacent text nodes, just for kicks. | ||||
2005 | for( my $i = 2; $i > $#$p; $i++ ) { # work forwards over the tokens except for the last | ||||
2006 | if( !ref($p->[$i]) and !ref($p->[$i + 1]) ) { | ||||
2007 | DEBUG > 5 and print STDERR "_verbatim_format merges {$p->[$i]} and {$p->[$i+1]}\n"; | ||||
2008 | $p->[$i] .= splice @$p, $i+1, 1; # merge | ||||
2009 | --$i; # and back up | ||||
2010 | } | ||||
2011 | } | ||||
2012 | |||||
2013 | # Now look for the last text token, and remove the terminal newline | ||||
2014 | for( my $i = $#$p; $i >= 2; $i-- ) { | ||||
2015 | # work backwards over the tokens, even the first | ||||
2016 | if( !ref($p->[$i]) ) { | ||||
2017 | if($p->[$i] =~ s/\n$//s) { | ||||
2018 | DEBUG > 5 and print STDERR "_verbatim_format killed the terminal newline on #$i: {$p->[$i]}, after {$p->[$i-1]}\n"; | ||||
2019 | } else { | ||||
2020 | DEBUG > 5 and print STDERR | ||||
2021 | "No terminal newline on #$i: {$p->[$i]}, after {$p->[$i-1]} !?\n"; | ||||
2022 | } | ||||
2023 | last; # we only want the next one | ||||
2024 | } | ||||
2025 | } | ||||
2026 | |||||
2027 | return; | ||||
2028 | } | ||||
2029 | |||||
2030 | |||||
2031 | #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | ||||
2032 | |||||
2033 | |||||
2034 | sub _treelet_from_formatting_codes { | ||||
2035 | # Given a paragraph, returns a treelet. Full of scary tokenizing code. | ||||
2036 | # Like [ '~Top', {'start_line' => $start_line}, | ||||
2037 | # "I like ", | ||||
2038 | # [ 'B', {}, "pie" ], | ||||
2039 | # "!" | ||||
2040 | # ] | ||||
2041 | # This illustrates the general format of a treelet. It is an array: | ||||
2042 | # [0] is a scalar indicating its type. In the example above, the | ||||
2043 | # types are '~Top' and 'B' | ||||
2044 | # [1] is a hash of various flags about it, possibly empty | ||||
2045 | # [2] - [N] are an ordered list of the subcomponents of the treelet. | ||||
2046 | # Scalars are literal text, refs are sub-treelets, to | ||||
2047 | # arbitrary levels. Stringifying a treelet will recursively | ||||
2048 | # stringify the sub-treelets, concatentating everything | ||||
2049 | # together to form the exact text of the treelet. | ||||
2050 | |||||
2051 | my($self, $para, $start_line, $preserve_space) = @_; | ||||
2052 | |||||
2053 | my $treelet = ['~Top', {'start_line' => $start_line},]; | ||||
2054 | |||||
2055 | unless ($preserve_space || $self->{'preserve_whitespace'}) { | ||||
2056 | $para =~ s/\s+/ /g; # collapse and trim all whitespace first. | ||||
2057 | $para =~ s/ $//; | ||||
2058 | $para =~ s/^ //; | ||||
2059 | } | ||||
2060 | |||||
2061 | # Only apparent problem the above code is that N<< >> turns into | ||||
2062 | # N<< >>. But then, word wrapping does that too! So don't do that! | ||||
2063 | |||||
2064 | |||||
2065 | # As a Start-code is encountered, the number of opening bracket '<' | ||||
2066 | # characters minus 1 is pushed onto @stack (so 0 means a single bracket, | ||||
2067 | # etc). When closing brackets are found in the text, at least this number | ||||
2068 | # (plus the 1) will be required to mean the Start-code is terminated. When | ||||
2069 | # those are found, @stack is popped. | ||||
2070 | my @stack; | ||||
2071 | |||||
2072 | my @lineage = ($treelet); | ||||
2073 | my $raw = ''; # raw content of L<> fcode before splitting/processing | ||||
2074 | # XXX 'raw' is not 100% accurate: all surrounding whitespace is condensed | ||||
2075 | # into just 1 ' '. Is this the regex's doing or 'raw's? Answer is it's | ||||
2076 | # the 'collapse and trim all whitespace first' lines just above. | ||||
2077 | my $inL = 0; | ||||
2078 | |||||
2079 | DEBUG > 4 and print STDERR "Paragraph:\n$para\n\n"; | ||||
2080 | |||||
2081 | # Here begins our frightening tokenizer RE. The following regex matches | ||||
2082 | # text in four main parts: | ||||
2083 | # | ||||
2084 | # * Start-codes. The first alternative matches C< or C<<, the latter | ||||
2085 | # followed by some whitespace. $1 will hold the entire start code | ||||
2086 | # (including any space following a multiple-angle-bracket delimiter), | ||||
2087 | # and $2 will hold only the additional brackets past the first in a | ||||
2088 | # multiple-bracket delimiter. length($2) + 1 will be the number of | ||||
2089 | # closing brackets we have to find. | ||||
2090 | # | ||||
2091 | # * Closing brackets. Match some amount of whitespace followed by | ||||
2092 | # multiple close brackets. The logic to see if this closes anything | ||||
2093 | # is down below. Note that in order to parse C<< >> correctly, we | ||||
2094 | # have to use look-behind (?<=\s\s), since the match of the starting | ||||
2095 | # code will have consumed the whitespace. | ||||
2096 | # | ||||
2097 | # * A single closing bracket, to close a simple code like C<>. | ||||
2098 | # | ||||
2099 | # * Something that isn't a start or end code. We have to be careful | ||||
2100 | # about accepting whitespace, since perlpodspec says that any whitespace | ||||
2101 | # before a multiple-bracket closing delimiter should be ignored. | ||||
2102 | # | ||||
2103 | while($para =~ | ||||
2104 | m/\G | ||||
2105 | (?: | ||||
2106 | # Match starting codes, including the whitespace following a | ||||
2107 | # multiple-delimiter start code. $1 gets the whole start code and | ||||
2108 | # $2 gets all but one of the <s in the multiple-bracket case. | ||||
2109 | ([A-Z]<(?:(<+)\s+)?) | ||||
2110 | | | ||||
2111 | # Match multiple-bracket end codes. $3 gets the whitespace that | ||||
2112 | # should be discarded before an end bracket but kept in other cases | ||||
2113 | # and $4 gets the end brackets themselves. ($3 can be empty if the | ||||
2114 | # construct is empty, like C<< >>, and all the white-space has been | ||||
2115 | # gobbled up already, considered to be space after the opening | ||||
2116 | # bracket. In this case we use look-behind to verify that there are | ||||
2117 | # at least 2 spaces in a row before the ">".) | ||||
2118 | (\s+|(?<=\s\s))(>{2,}) | ||||
2119 | | | ||||
2120 | (\s?>) # $5: simple end-codes | ||||
2121 | | | ||||
2122 | ( # $6: stuff containing no start-codes or end-codes | ||||
2123 | (?: | ||||
2124 | [^A-Z\s>] | ||||
2125 | | | ||||
2126 | (?: | ||||
2127 | [A-Z](?!<) | ||||
2128 | ) | ||||
2129 | | | ||||
2130 | # whitespace is ok, but we don't want to eat the whitespace before | ||||
2131 | # a multiple-bracket end code. | ||||
2132 | # NOTE: we may still have problems with e.g. S<< >> | ||||
2133 | (?: | ||||
2134 | \s(?!\s*>{2,}) | ||||
2135 | ) | ||||
2136 | )+ | ||||
2137 | ) | ||||
2138 | ) | ||||
2139 | /xgo | ||||
2140 | ) { | ||||
2141 | DEBUG > 4 and print STDERR "\nParagraphic tokenstack = (@stack)\n"; | ||||
2142 | if(defined $1) { | ||||
2143 | my $bracket_count; # How many '<<<' in a row this has. Needed for | ||||
2144 | # Pod::Simple::JustPod | ||||
2145 | if(defined $2) { | ||||
2146 | DEBUG > 3 and print STDERR "Found complex start-text code \"$1\"\n"; | ||||
2147 | $bracket_count = length($2) + 1; | ||||
2148 | push @stack, $bracket_count; # length of the necessary complex | ||||
2149 | # end-code string | ||||
2150 | } else { | ||||
2151 | DEBUG > 3 and print STDERR "Found simple start-text code \"$1\"\n"; | ||||
2152 | push @stack, 0; # signal that we're looking for simple | ||||
2153 | $bracket_count = 1; | ||||
2154 | } | ||||
2155 | my $code = substr($1,0,1); | ||||
2156 | if ('L' eq $code) { | ||||
2157 | if ($inL) { | ||||
2158 | $raw .= $1; | ||||
2159 | $self->scream( $start_line, | ||||
2160 | 'Nested L<> are illegal. Pretending inner one is ' | ||||
2161 | . 'X<...> so can continue looking for other errors.'); | ||||
2162 | $code = "X"; | ||||
2163 | } | ||||
2164 | else { | ||||
2165 | $raw = ""; # reset raw content accumulator | ||||
2166 | $inL = @stack; | ||||
2167 | } | ||||
2168 | } else { | ||||
2169 | $raw .= $1 if $inL; | ||||
2170 | } | ||||
2171 | push @lineage, [ $code, {}, ]; # new node object | ||||
2172 | |||||
2173 | # Tell Pod::Simple::JustPod how many brackets there were, but to save | ||||
2174 | # space, not in the most usual case of there was just 1. It can be | ||||
2175 | # inferred by the absence of this element. Similarly, if there is more | ||||
2176 | # than one bracket, extract the white space between the final bracket | ||||
2177 | # and the real beginning of the interior. Save that if it isn't just a | ||||
2178 | # single space | ||||
2179 | if ($self->{'_output_is_for_JustPod'} && $bracket_count > 1) { | ||||
2180 | $lineage[-1][1]{'~bracket_count'} = $bracket_count; | ||||
2181 | my $lspacer = substr($1, 1 + $bracket_count); | ||||
2182 | $lineage[-1][1]{'~lspacer'} = $lspacer if $lspacer ne " "; | ||||
2183 | } | ||||
2184 | push @{ $lineage[-2] }, $lineage[-1]; | ||||
2185 | } elsif(defined $4) { | ||||
2186 | DEBUG > 3 and print STDERR "Found apparent complex end-text code \"$3$4\"\n"; | ||||
2187 | # This is where it gets messy... | ||||
2188 | if(! @stack) { | ||||
2189 | # We saw " >>>>" but needed nothing. This is ALL just stuff then. | ||||
2190 | DEBUG > 4 and print STDERR " But it's really just stuff.\n"; | ||||
2191 | push @{ $lineage[-1] }, $3, $4; | ||||
2192 | next; | ||||
2193 | } elsif(!$stack[-1]) { | ||||
2194 | # We saw " >>>>" but needed only ">". Back pos up. | ||||
2195 | DEBUG > 4 and print STDERR " And that's more than we needed to close simple.\n"; | ||||
2196 | push @{ $lineage[-1] }, $3; # That was a for-real space, too. | ||||
2197 | pos($para) = pos($para) - length($4) + 1; | ||||
2198 | } elsif($stack[-1] == length($4)) { | ||||
2199 | # We found " >>>>", and it was exactly what we needed. Commonest case. | ||||
2200 | DEBUG > 4 and print STDERR " And that's exactly what we needed to close complex.\n"; | ||||
2201 | } elsif($stack[-1] < length($4)) { | ||||
2202 | # We saw " >>>>" but needed only " >>". Back pos up. | ||||
2203 | DEBUG > 4 and print STDERR " And that's more than we needed to close complex.\n"; | ||||
2204 | pos($para) = pos($para) - length($4) + $stack[-1]; | ||||
2205 | } else { | ||||
2206 | # We saw " >>>>" but needed " >>>>>>". So this is all just stuff! | ||||
2207 | DEBUG > 4 and print STDERR " But it's really just stuff, because we needed more.\n"; | ||||
2208 | push @{ $lineage[-1] }, $3, $4; | ||||
2209 | next; | ||||
2210 | } | ||||
2211 | #print STDERR "\nHOOBOY ", scalar(@{$lineage[-1]}), "!!!\n"; | ||||
2212 | |||||
2213 | if ($3 ne " " && $self->{'_output_is_for_JustPod'}) { | ||||
2214 | if ($3 ne "") { | ||||
2215 | $lineage[-1][1]{'~rspacer'} = $3; | ||||
2216 | } | ||||
2217 | elsif ($lineage[-1][1]{'~lspacer'} eq " ") { | ||||
2218 | |||||
2219 | # Here we had something like C<< >> which was a false positive | ||||
2220 | delete $lineage[-1][1]{'~lspacer'}; | ||||
2221 | } | ||||
2222 | else { | ||||
2223 | $lineage[-1][1]{'~rspacer'} | ||||
2224 | = substr($lineage[-1][1]{'~lspacer'}, -1, 1); | ||||
2225 | chop $lineage[-1][1]{'~lspacer'}; | ||||
2226 | } | ||||
2227 | } | ||||
2228 | |||||
2229 | push @{ $lineage[-1] }, '' if 2 == @{ $lineage[-1] }; | ||||
2230 | # Keep the element from being childless | ||||
2231 | |||||
2232 | if ($inL == @stack) { | ||||
2233 | $lineage[-1][1]{'raw'} = $raw; | ||||
2234 | $inL = 0; | ||||
2235 | } | ||||
2236 | |||||
2237 | pop @stack; | ||||
2238 | pop @lineage; | ||||
2239 | |||||
2240 | $raw .= $3.$4 if $inL; | ||||
2241 | |||||
2242 | } elsif(defined $5) { | ||||
2243 | DEBUG > 3 and print STDERR "Found apparent simple end-text code \"$5\"\n"; | ||||
2244 | |||||
2245 | if(@stack and ! $stack[-1]) { | ||||
2246 | # We're indeed expecting a simple end-code | ||||
2247 | DEBUG > 4 and print STDERR " It's indeed an end-code.\n"; | ||||
2248 | |||||
2249 | if(length($5) == 2) { # There was a space there: " >" | ||||
2250 | push @{ $lineage[-1] }, ' '; | ||||
2251 | } elsif( 2 == @{ $lineage[-1] } ) { # Closing a childless element | ||||
2252 | push @{ $lineage[-1] }, ''; # keep it from being really childless | ||||
2253 | } | ||||
2254 | |||||
2255 | if ($inL == @stack) { | ||||
2256 | $lineage[-1][1]{'raw'} = $raw; | ||||
2257 | $inL = 0; | ||||
2258 | } | ||||
2259 | |||||
2260 | pop @stack; | ||||
2261 | pop @lineage; | ||||
2262 | } else { | ||||
2263 | DEBUG > 4 and print STDERR " It's just stuff.\n"; | ||||
2264 | push @{ $lineage[-1] }, $5; | ||||
2265 | } | ||||
2266 | |||||
2267 | $raw .= $5 if $inL; | ||||
2268 | |||||
2269 | } elsif(defined $6) { | ||||
2270 | DEBUG > 3 and print STDERR "Found stuff \"$6\"\n"; | ||||
2271 | push @{ $lineage[-1] }, $6; | ||||
2272 | $raw .= $6 if $inL; | ||||
2273 | # XXX does not capture multiplace whitespaces -- 'raw' ends up with | ||||
2274 | # at most 1 leading/trailing whitespace, why not all of it? | ||||
2275 | # Answer, because we deliberately trimmed it above | ||||
2276 | |||||
2277 | } else { | ||||
2278 | # should never ever ever ever happen | ||||
2279 | DEBUG and print STDERR "AYYAYAAAAA at line ", __LINE__, "\n"; | ||||
2280 | die "SPORK 512512!"; | ||||
2281 | } | ||||
2282 | } | ||||
2283 | |||||
2284 | if(@stack) { # Uhoh, some sequences weren't closed. | ||||
2285 | my $x= "..."; | ||||
2286 | while(@stack) { | ||||
2287 | push @{ $lineage[-1] }, '' if 2 == @{ $lineage[-1] }; | ||||
2288 | # Hmmmmm! | ||||
2289 | |||||
2290 | my $code = (pop @lineage)->[0]; | ||||
2291 | my $ender_length = pop @stack; | ||||
2292 | if($ender_length) { | ||||
2293 | --$ender_length; | ||||
2294 | $x = $code . ("<" x $ender_length) . " $x " . (">" x $ender_length); | ||||
2295 | } else { | ||||
2296 | $x = $code . "<$x>"; | ||||
2297 | } | ||||
2298 | } | ||||
2299 | DEBUG > 1 and print STDERR "Unterminated $x sequence\n"; | ||||
2300 | $self->whine($start_line, | ||||
2301 | "Unterminated $x sequence", | ||||
2302 | ); | ||||
2303 | } | ||||
2304 | |||||
2305 | return $treelet; | ||||
2306 | } | ||||
2307 | |||||
2308 | #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | ||||
2309 | |||||
2310 | sub text_content_of_treelet { # method: $parser->text_content_of_treelet($lol) | ||||
2311 | return stringify_lol($_[1]); | ||||
2312 | } | ||||
2313 | |||||
2314 | sub stringify_lol { # function: stringify_lol($lol) | ||||
2315 | my $string_form = ''; | ||||
2316 | _stringify_lol( $_[0] => \$string_form ); | ||||
2317 | return $string_form; | ||||
2318 | } | ||||
2319 | |||||
2320 | sub _stringify_lol { # the real recursor | ||||
2321 | my($lol, $to) = @_; | ||||
2322 | for(my $i = 2; $i < @$lol; ++$i) { | ||||
2323 | if( ref($lol->[$i] || '') and UNIVERSAL::isa($lol->[$i], 'ARRAY') ) { | ||||
2324 | _stringify_lol( $lol->[$i], $to); # recurse! | ||||
2325 | } else { | ||||
2326 | $$to .= $lol->[$i]; | ||||
2327 | } | ||||
2328 | } | ||||
2329 | return; | ||||
2330 | } | ||||
2331 | |||||
2332 | #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | ||||
2333 | |||||
2334 | sub _dump_curr_open { # return a string representation of the stack | ||||
2335 | my $curr_open = $_[0]{'curr_open'}; | ||||
2336 | |||||
2337 | return '[empty]' unless @$curr_open; | ||||
2338 | return join '; ', | ||||
2339 | map {; | ||||
2340 | ($_->[0] eq '=for') | ||||
2341 | ? ( ($_->[1]{'~really'} || '=over') | ||||
2342 | . ' ' . $_->[1]{'target'}) | ||||
2343 | : $_->[0] | ||||
2344 | } | ||||
2345 | @$curr_open | ||||
2346 | ; | ||||
2347 | } | ||||
2348 | |||||
2349 | ########################################################################### | ||||
2350 | 1 | 5µs | my %pretty_form = ( | ||
2351 | "\a" => '\a', # ding! | ||||
2352 | "\b" => '\b', # BS | ||||
2353 | "\e" => '\e', # ESC | ||||
2354 | "\f" => '\f', # FF | ||||
2355 | "\t" => '\t', # tab | ||||
2356 | "\cm" => '\cm', | ||||
2357 | "\cj" => '\cj', | ||||
2358 | "\n" => '\n', # probably overrides one of either \cm or \cj | ||||
2359 | '"' => '\"', | ||||
2360 | '\\' => '\\\\', | ||||
2361 | '$' => '\\$', | ||||
2362 | '@' => '\\@', | ||||
2363 | '%' => '\\%', | ||||
2364 | '#' => '\\#', | ||||
2365 | ); | ||||
2366 | |||||
2367 | sub pretty { # adopted from Class::Classless | ||||
2368 | # Not the most brilliant routine, but passable. | ||||
2369 | # Don't give it a cyclic data structure! | ||||
2370 | my @stuff = @_; # copy | ||||
2371 | my $x; | ||||
2372 | my $out = | ||||
2373 | # join ",\n" . | ||||
2374 | join ", ", | ||||
2375 | map {; | ||||
2376 | if(!defined($_)) { | ||||
2377 | "undef"; | ||||
2378 | } elsif(ref($_) eq 'ARRAY' or ref($_) eq 'Pod::Simple::LinkSection') { | ||||
2379 | $x = "[ " . pretty(@$_) . " ]" ; | ||||
2380 | $x; | ||||
2381 | } elsif(ref($_) eq 'SCALAR') { | ||||
2382 | $x = "\\" . pretty($$_) ; | ||||
2383 | $x; | ||||
2384 | } elsif(ref($_) eq 'HASH') { | ||||
2385 | my $hr = $_; | ||||
2386 | $x = "{" . join(", ", | ||||
2387 | map(pretty($_) . '=>' . pretty($hr->{$_}), | ||||
2388 | sort keys %$hr ) ) . "}" ; | ||||
2389 | $x; | ||||
2390 | } elsif(!length($_)) { q{''} # empty string | ||||
2391 | } elsif( | ||||
2392 | $_ eq '0' # very common case | ||||
2393 | or( | ||||
2394 | m/^-?(?:[123456789]\d*|0)(?:\.\d+)?$/s | ||||
2395 | and $_ ne '-0' # the strange case that RE lets thru | ||||
2396 | ) | ||||
2397 | ) { $_; | ||||
2398 | } else { | ||||
2399 | # Yes, explicitly name every character desired. There are shorcuts one | ||||
2400 | # could make, but I (Karl Williamson) was afraid that some Perl | ||||
2401 | # releases would have bugs in some of them. For example [A-Z] works | ||||
2402 | # even on EBCDIC platforms to match exactly the 26 uppercase English | ||||
2403 | # letters, but I don't know if it has always worked without bugs. It | ||||
2404 | # seemed safest just to list the characters. | ||||
2405 | # s<([^\x20\x21\x23\x27-\x3F\x41-\x5B\x5D-\x7E])> | ||||
2406 | s<([^ !"#'()*+,\-./0123456789:;\<=\>?ABCDEFGHIJKLMNOPQRSTUVWXYZ\[\]^_`abcdefghijklmnopqrstuvwxyz{|}~])> | ||||
2407 | <$pretty_form{$1} || '\\x{'.sprintf("%x", ord($1)).'}'>eg; | ||||
2408 | #<$pretty_form{$1} || '\\x'.(unpack("H2",$1))>eg; | ||||
2409 | qq{"$_"}; | ||||
2410 | } | ||||
2411 | } @stuff; | ||||
2412 | # $out =~ s/\n */ /g if length($out) < 75; | ||||
2413 | return $out; | ||||
2414 | } | ||||
2415 | |||||
2416 | #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | ||||
2417 | |||||
2418 | # A rather unsubtle method of blowing away all the state information | ||||
2419 | # from a parser object so it can be reused. Provided as a utility for | ||||
2420 | # backward compatibility in Pod::Man, etc. but not recommended for | ||||
2421 | # general use. | ||||
2422 | |||||
2423 | sub reinit { | ||||
2424 | my $self = shift; | ||||
2425 | foreach (qw(source_dead source_filename doc_has_started | ||||
2426 | start_of_pod_block content_seen last_was_blank paras curr_open | ||||
2427 | line_count pod_para_count in_pod ~tried_gen_errata all_errata errata errors_seen | ||||
2428 | Title)) { | ||||
2429 | |||||
2430 | delete $self->{$_}; | ||||
2431 | } | ||||
2432 | } | ||||
2433 | |||||
2434 | #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | ||||
2435 | 1 | 12µs | 1; | ||
2436 | |||||
# spent 66µs within Pod::Simple::BlackBox::CORE:match which was called 9 times, avg 7µs/call:
# once (20µs+0s) by Pod::Simple::BlackBox::my_qr at line 1 of (eval 37)[Pod/Simple/BlackBox.pm:44]
# once (10µs+0s) by Pod::Simple::BlackBox::my_qr at line 1 of (eval 32)[Pod/Simple/BlackBox.pm:44]
# once (10µs+0s) by Pod::Simple::BlackBox::my_qr at line 1 of (eval 34)[Pod/Simple/BlackBox.pm:44]
# once (7µs+0s) by Pod::Simple::BlackBox::my_qr at line 1 of (eval 41)[Pod/Simple/BlackBox.pm:44]
# once (7µs+0s) by Pod::Simple::BlackBox::my_qr at line 1 of (eval 39)[Pod/Simple/BlackBox.pm:44]
# once (6µs+0s) by Pod::Simple::BlackBox::my_qr at line 1 of (eval 43)[Pod/Simple/BlackBox.pm:44]
# once (5µs+0s) by Pod::Simple::BlackBox::my_qr at line 1 of (eval 30)[Pod/Simple/BlackBox.pm:44]
# once (1µs+0s) by Pod::Simple::BlackBox::my_qr at line 1 of (eval 28)[Pod/Simple/BlackBox.pm:44]
# once (1µs+0s) by Pod::Simple::BlackBox::my_qr at line 1 of (eval 45)[Pod/Simple/BlackBox.pm:44] | |||||
# spent 8µs within Pod::Simple::BlackBox::CORE:qr which was called 10 times, avg 840ns/call:
# once (2µs+0s) by Pod::Simple::BlackBox::my_qr at line 1 of (eval 44)[Pod/Simple/BlackBox.pm:40]
# once (1µs+0s) by Pod::Simple::BlackBox::my_qr at line 1 of (eval 27)[Pod/Simple/BlackBox.pm:40]
# once (700ns+0s) by Pod::Simple::LinkSection::BEGIN@7 at line 2 of (eval 35)[Pod/Simple/BlackBox.pm:74]
# once (700ns+0s) by Pod::Simple::BlackBox::my_qr at line 1 of (eval 31)[Pod/Simple/BlackBox.pm:40]
# once (600ns+0s) by Pod::Simple::BlackBox::my_qr at line 1 of (eval 42)[Pod/Simple/BlackBox.pm:40]
# once (600ns+0s) by Pod::Simple::BlackBox::my_qr at line 1 of (eval 36)[Pod/Simple/BlackBox.pm:40]
# once (600ns+0s) by Pod::Simple::BlackBox::my_qr at line 1 of (eval 29)[Pod/Simple/BlackBox.pm:40]
# once (600ns+0s) by Pod::Simple::BlackBox::my_qr at line 1 of (eval 40)[Pod/Simple/BlackBox.pm:40]
# once (600ns+0s) by Pod::Simple::BlackBox::my_qr at line 1 of (eval 38)[Pod/Simple/BlackBox.pm:40]
# once (600ns+0s) by Pod::Simple::BlackBox::my_qr at line 1 of (eval 33)[Pod/Simple/BlackBox.pm:40] |