1 | #!/usr/bin/env python |
---|
2 | # |
---|
3 | # python script to process makedoc instructions in a source file and produce |
---|
4 | # DocBook XML output |
---|
5 | # |
---|
6 | |
---|
7 | # |
---|
# This performs 3 stages of processing on its input, in a similar fashion
---|
9 | # to makedoc: |
---|
10 | # |
---|
11 | # 1. Discard everything outside of /* */ comments |
---|
# 2. Identify lines which contain commands (a single uppercase word)
---|
13 | # 3. Apply each command to the text of the following lines (up to the next |
---|
14 | # command or the end of the comment block), to produce some output |
---|
15 | # |
---|
16 | # The resulting output contains one or more DocBook XML refentry elements. |
---|
17 | # |
---|
18 | # To make the output a valid XML document which can be xincluded, those refentry |
---|
19 | # elements are contained by a refcontainer element. refcontainer is not part of |
---|
20 | # the DocBook DTD and should be removed by a suitable XSLT. |
---|
21 | # |
---|
22 | |
---|
23 | from __future__ import print_function |
---|
24 | |
---|
25 | import sys |
---|
26 | import re |
---|
27 | from optparse import OptionParser |
---|
28 | import lxml.etree |
---|
29 | import ply.lex as lex |
---|
30 | import ply.yacc as yacc |
---|
31 | |
---|
# Module-level state shared by the command handlers below.
rootelement = None  # root element of the XML tree
refentry = None  # the current refentry
verbose = 0  # debug/trace level; set from the -v count in __main__
---|
35 | |
---|
def dump(s, stage, threshold = 1):
    """Print s to stderr between banner lines, labelled with stage.

    Emits nothing unless the global verbose level exceeds threshold.
    """
    if verbose <= threshold:
        return
    banner = '*' * 40
    for line in (banner, stage, banner, '%s' % s, banner):
        print(line, file=sys.stderr)
---|
43 | |
---|
44 | # |
---|
45 | # Stage 1 |
---|
46 | # |
---|
47 | |
---|
def skip_whitespace_and_stars(i, src):
    """Return the index of the first character at or after i in src which is
    neither whitespace nor a decorative '*' (a '*' which does not start a
    '*/' comment terminator).

    The original read src[i+1] unconditionally, raising IndexError when a
    '*' was the last character of src; a bounds check now treats a trailing
    '*' as decorative and skips it.
    """
    while i < len(src) and (src[i].isspace() or
                            (src[i] == '*' and (i + 1 >= len(src) or src[i+1] != '/'))):
        i += 1

    return i
---|
54 | |
---|
# Discard everything not inside '/* */' style-comments which start at column 0
# Discard any leading blank space or '*'
# Discard a single leading '.'
# Discard blank lines after a blank line
def comment_contents_generator(src):
    """Yield, character by character, the text contained in '/* */' comment
    blocks of src, normalized as described above, with '\nEND\n' appended
    after each comment block.

    src is scanned by index; the caller prepends a '\n' so a comment opener
    on the very first line still matches the '\n/*' test below.
    """
    i = 0

    while i < len(src) - 2:
        if src[i] == '\n' and src[i+1] == '/' and src[i+2] == '*':
            # found a comment opener at the start of a line
            i = i + 3

            i = skip_whitespace_and_stars(i, src)

            # discard a single leading '.'
            if src[i] == '.':
                i += 1

            # emit the comment body until the '*/' terminator
            while i < len(src):
                if src[i] == '\n':
                    yield '\n'
                    i += 1

                    # allow a single blank line
                    if i < len(src) and src[i] == '\n':
                        yield '\n'
                        i += 1

                    i = skip_whitespace_and_stars(i, src)

                elif src[i] == '*' and src[i+1] == '/':
                    i = i + 2
                    # If we have just output \n\n, this adds another blank line.
                    # This is the only way a double blank line can occur.
                    yield '\nEND\n'
                    break
                else:
                    yield src[i]
                    i += 1
        else:
            i += 1
---|
94 | |
---|
def remove_noncomments(src):
    """Stage 1: reduce src to just the text of its makedoc comment blocks."""
    # prepend a newline so a '/*' on the very first line is still recognized
    # by comment_contents_generator's '\n/*' test
    padded = '\n' + src
    extracted = ''.join(comment_contents_generator(padded))
    dump(extracted, 'extracted from comments')

    return extracted
---|
101 | |
---|
102 | # |
---|
103 | # Stage 2 |
---|
104 | # |
---|
105 | |
---|
# A command is a single word of at least 3 characters, all uppercase, and alone on a line
def iscommand(l):
    """Return True if line l consists solely of a makedoc command word
    (3 or more uppercase letters/underscores, optional trailing blanks)."""
    # raw string: '\s' in a plain literal is a deprecated escape in Python 3
    return bool(re.match(r'^[A-Z_]{3,}\s*$', l))
---|
112 | |
---|
def command_block_generator(content):
    """Yield (command, text) tuples: each command line found in content,
    paired with the accumulated text of the lines that follow it.

    Text before the first command is attached to a synthetic 'START'
    command; the final accumulated text is always flushed at the end.
    """
    current = 'START'
    pending = []

    for line in content.splitlines():
        if iscommand(line):
            yield (current, ''.join(pending))
            current = line.rstrip()
            pending = []
        else:
            pending.append(line + '\n')

    yield (current, ''.join(pending))
---|
125 | |
---|
# Look for commands, which give instructions how to process the following input
def process(content):
    """Stage 2: split content into an ordered list of (command, text) tuples."""
    content = content.lstrip()

    dump(content, 'about to process for commands')

    # it is important to maintain the order of the sections the commands
    # generate, hence a list rather than a dict
    return list(command_block_generator(content))
---|
137 | |
---|
138 | # |
---|
139 | # Stage 3 |
---|
140 | # |
---|
141 | |
---|
# invoke each command on its text
def perform(processed):
    """Stage 3: dispatch each (command, text) tuple to its handler."""
    for command, text in processed:
        c = command.rstrip()
        t = text.strip() + '\n'

        if verbose:
            print("performing command '%s'" % c, file=sys.stderr)

        handler = command_dispatch_dict.get(c)
        if handler is not None:
            handler(c, t)
        else:
            print("command '%s' is not recognized" % c, file=sys.stderr)
            # the text following an unrecognized command is discarded
---|
156 | |
---|
# FUNCTION (aka TYPEDEF)
#
def function(c, l):
    """Process a FUNCTION command.

    The text is one or more 'name[, name...] --- description' lines.
    Starts a new refentry under the root element and fills in its refmeta
    and refnamediv from the parsed names and descriptions.  Exits with an
    error if a refentry is already open (i.e. no NEWPAGE intervened).
    """
    global refentry
    global rootelement

    l = l.strip()
    if verbose:
        print('FUNCTION %s' % l, file=sys.stderr)

    separator = '---'

    if ';' in l:
        # fpclassify has an unusual format we also need to handle
        spliton = ';'
        l = l.splitlines()[0]
    elif len(l.splitlines()) > 1:
        # a few pages like mktemp have two '---' lines
        spliton = ';'
        o = ''
        for i in l.splitlines():
            if separator in i:
                o += i + ';'
            else:
                o += i
        l = o[:-1]
    else:
        spliton = '\n'

    namelist = []
    descrlist = []
    for a in l.split(spliton):
        (n, d) = a.split(separator, 1)
        namelist = namelist + n.split(',')
        descrlist = descrlist + [d]

    # only copysign and log1p use <[ ]> markup in descr,
    # only gets() uses << >> markup
    # but we should handle it correctly
    descr = line_markup_convert(', '.join(descrlist))

    # fpclassify includes an 'and' we need to discard
    # (list() is required: under Python 3, map() returns an iterator, which
    # would make the namelist[0] subscripts below raise TypeError)
    namelist = list(map(lambda v: re.sub('^and ', '', v.strip(), 1), namelist))
    # strip off << >> surrounding name
    namelist = list(map(lambda v: v.strip().lstrip('<').rstrip('>'), namelist))

    if verbose:
        print(namelist, file=sys.stderr)
    # additional alternate names may also appear in INDEX commands

    # create the root element if needed
    if rootelement is None:
        rootelement = lxml.etree.Element('refentrycontainer')

    # FUNCTION implies starting a new refentry
    if refentry is not None:
        print("multiple FUNCTIONs without NEWPAGE", file=sys.stderr)
        exit(1)

    # create the refentry
    refentry = lxml.etree.SubElement(rootelement, 'refentry')
    refentry.append(lxml.etree.Comment(' Generated by makedocbook.py '))
    refentry.set('id', namelist[0].lstrip('_'))

    refmeta = lxml.etree.SubElement(refentry, 'refmeta')
    # refentrytitle will be same as refdescriptor, the primary name
    refentrytitle = lxml.etree.SubElement(refmeta, 'refentrytitle')
    refentrytitle.text = namelist[0]
    manvolnum = lxml.etree.SubElement(refmeta, 'manvolnum')
    manvolnum.text = '3'

    refnamediv = lxml.etree.SubElement(refentry, 'refnamediv')
    # refdescriptor is the primary name, assume we should use the one which
    # appears first in the list
    refdescriptor = lxml.etree.SubElement(refnamediv, 'refdescriptor')
    refdescriptor.text = namelist[0]
    # refname elements exist for all alternate names
    for n in namelist:
        refname = lxml.etree.SubElement(refnamediv, 'refname')
        refname.text = n
    refpurpose = lxml.etree.SubElement(refnamediv, 'refpurpose')
    # descr may contain markup, so parse it rather than assigning .text
    refnamediv.replace(refpurpose, lxml.etree.fromstring('<refpurpose>' + descr + '</refpurpose>'))

# Only FUNCTION currently exists, which implies that the SYNOPSIS should be
# a funcsynopsis. If TYPEDEF was to be added, SYNOPSIS should be processed
# in a different way, probably producing a refsynopsis.
---|
243 | |
---|
# INDEX
# may occur more than once for each FUNCTION giving alternate names this
# function should be indexed under
#
def index(c, l):
    """Process an INDEX command: add an indexterm (and, if new, a refname)
    for the first word of the command's text."""
    l = l.strip()

    if verbose:
        print('INDEX %s' % l, file=sys.stderr)

    # discard anything after the first word
    l = l.split()[0]

    # add indexterm
    # (we could just index under all the refnames, but we control the indexing
    # separately as that is what makedoc does)
    term = lxml.etree.SubElement(refentry, 'indexterm')
    lxml.etree.SubElement(term, 'primary').text = l

    # to validate, it seems we need to maintain refentry elements in a certain order
    refentry[:] = sorted(refentry, key=lambda e: e.tag)

    refnamediv = refentry.find('refnamediv')

    # add another alternate refname, as long as it doesn't already exist
    if refnamediv.xpath('refname[.="%s"]' % l):
        if verbose > 1:
            print('duplicate refname %s discarded' % l, file=sys.stderr)
    else:
        lxml.etree.SubElement(refnamediv, 'refname').text = l
        if verbose > 1:
            print('added refname %s' % l, file=sys.stderr)

    # to validate, it seems we need to maintain refnamediv elements in a certain order
    refnamediv[:] = sorted(refnamediv, key=lambda e: e.tag)
---|
282 | |
---|
283 | |
---|
# SYNOPSIS aka ANSI_SYNOPSIS
# ANSI-style synopsis
#
# Note that makedoc would also process <<code>> markup here, but there are no
# such uses.
#
def synopsis(c, t):
    """Process a SYNOPSIS/ANSI_SYNOPSIS command.

    Lines starting with '#', '[' or 'struct' become funcsynopsisinfo;
    everything else is accumulated until a ';' completes a prototype, which
    is then handed to synopsis_for_prototype().
    """
    refsynopsisdiv = lxml.etree.SubElement(refentry, 'refsynopsisdiv')
    funcsynopsis = lxml.etree.SubElement(refsynopsisdiv, 'funcsynopsis')

    s = ''
    for l in t.splitlines():
        if re.match('\s*(#|\[|struct)', l):
            # preprocessor # directives, structs, comments in square brackets
            funcsynopsisinfo = lxml.etree.SubElement(funcsynopsis, 'funcsynopsisinfo')
            funcsynopsisinfo.text = l.strip() + '\n'
        else:
            # accumulate prototype fragments until a ';' terminates one
            s = s + l

            # a prototype without a terminating ';' is an error
            if s.endswith(')'):
                print("'%s' missing terminating semicolon" % l, file=sys.stderr)
                s = s + ';'
                exit(1)

            if ';' in s:
                synopsis_for_prototype(funcsynopsis, s)
                s = ''

    if s.strip():
        print("surplus synopsis '%s'" % s, file=sys.stderr)
        # NOTE(review): bare raise with no active exception raises
        # RuntimeError here — presumably just intended to abort; confirm
        raise
---|
316 | |
---|
def synopsis_for_prototype(funcsynopsis, s):
    """Convert the ';'-separated C prototypes in s into funcprototype
    elements appended to funcsynopsis.

    Relies on each parameter name being wrapped in <[ ]> markup so the
    prototype can be decomposed into funcdef/paramdef/parameter parts.
    """
    s = s.strip()

    # funcsynopsis has a very detailed content model, so we need to massage the
    # bare prototype into it. Fortunately, since the parameter names are marked
    # up, we have enough information to do this.
    for fp in s.split(';'):
        fp = fp.strip()
        if fp:

            if verbose:
                print("'%s'" % fp, file=sys.stderr)

            # split into: return type (1), function name (2), parameter list (3)
            match = re.match(r'(.*?)([\w\d]*) ?\((.*)\)', fp)

            if verbose:
                print(match.groups(), file=sys.stderr)

            funcprototype = lxml.etree.SubElement(funcsynopsis, 'funcprototype')
            funcdef = lxml.etree.SubElement(funcprototype, 'funcdef')
            funcdef.text = match.group(1)
            function = lxml.etree.SubElement(funcdef, 'function')
            function.text = match.group(2)

            if match.group(3).strip() == 'void':
                # a 'void' parameter list maps to the dedicated <void/> element
                void = lxml.etree.SubElement(funcprototype, 'void')
            else:
                # Split parameters on ',' except if it is inside ()
                for p in re.split(',(?![^()]*\))', match.group(3)):
                    p = p.strip()

                    if verbose:
                        print(p, file=sys.stderr)

                    if p == '...':
                        varargs = lxml.etree.SubElement(funcprototype, 'varargs')
                    else:
                        paramdef = lxml.etree.SubElement(funcprototype, 'paramdef')
                        parameter = lxml.etree.SubElement(paramdef, 'parameter')

                        # <[ ]> enclose the parameter name
                        match2 = re.match('(.*)<\[(.*)\]>(.*)', p)

                        if verbose:
                            print(match2.groups(), file=sys.stderr)

                        # type before the name, the name itself, anything after
                        # (e.g. array brackets) as the tail
                        paramdef.text = match2.group(1)
                        parameter.text = match2.group(2)
                        parameter.tail = match2.group(3)
---|
366 | |
---|
367 | |
---|
# DESCRIPTION
# (RETURNS, ERRORS, PORTABILITY, BUGS, WARNINGS, SEEALSO, NOTES are handled the same)
#
# Create a refsect with a title corresponding to the command
#
# Nearly all the existing DESCRIPTION contents could be transformed into
# DocBook with a few regex substitutions. Unfortunately, pages like sprintf and
# sscanf, have very complex layout using nested tables and itemized lists, which
# it is best to parse in order to transform correctly.
#

def refsect(t, s):
    """Create a refsect1 titled after command t and fill it with the XML
    produced by lexing/parsing the makedoc-marked-up section text s."""
    refsect = lxml.etree.SubElement(refentry, 'refsect1')
    title = lxml.etree.SubElement(refsect, 'title')
    title.text = t.title()

    if verbose:
        print('%s has %d paragraphs' % (t, len(s.split('\n\n'))) , file=sys.stderr)

    if verbose > 1:
        dump(s, 'before lexing')

        # dump out lexer token sequence
        lex.input(s)
        for tok in lexer:
            print(tok, file=sys.stderr)

    # parse the section text for makedoc markup and the few pieces of texinfo
    # markup we understand, and output an XML marked-up string
    xml = parser.parse(s, tracking=True, debug=(verbose > 2))

    dump(xml, 'after parsing')

    # wrap so the string parses as a single element, then graft its children
    xml = '<refsect1>' + xml + '</refsect1>'

    refsect.extend(lxml.etree.fromstring(xml))
---|
404 | |
---|
def seealso(c, t):
    """SEEALSO command: emit an ordinary refsect, but titled 'SEE ALSO'."""
    refsect('SEE ALSO', t)
---|
407 | |
---|
# NEWPAGE
#
# start a new refentry

def newpage(c, t):
    """Close the current refentry so the next FUNCTION starts a fresh one."""
    global refentry
    refentry = None
---|
415 | |
---|
# command dispatch table

def discarded(c, t):
    """Handler for commands whose text is deliberately ignored."""
    return
---|
420 | |
---|
# maps each recognized makedoc command name to the handler perform() invokes
command_dispatch_dict = {
    'FUNCTION' : function,
    'TYPEDEF' : function, # TYPEDEF is not currently used, but described in doc.str
    'INDEX' : index,
    'TRAD_SYNOPSIS' : discarded, # K&R-style synopsis, obsolete and discarded
    'ANSI_SYNOPSIS' : synopsis,
    'SYNOPSIS' : synopsis,
    'DESCRIPTION' : refsect,
    'RETURNS' : refsect,
    'ERRORS' : refsect,
    'PORTABILITY' : refsect,
    'BUGS' : refsect,
    'WARNINGS' : refsect,
    'SEEALSO' : seealso,
    'NOTES' : refsect, # NOTES is not described in doc.str, so is currently discarded by makedoc, but that doesn't seem right
    'QUICKREF' : discarded, # The intent of QUICKREF and MATHREF is not obvious, but they don't generate any output currently
    'MATHREF' : discarded,
    'START' : discarded, # a START command is inserted to contain the text before the first command
    'END' : discarded, # an END command is inserted merely to terminate the text for the last command in a comment block
    'NEWPAGE' : newpage,
}
---|
442 | |
---|
443 | # |
---|
444 | # Utility functions |
---|
445 | # |
---|
446 | |
---|
# apply transformations which are easy to do in-place
def line_markup_convert(p):
    """Return p with makedoc/texinfo inline markup converted to DocBook XML
    and XML-special characters escaped.

    Escaping must happen before the << >> / <[ ]> conversions, so those
    conversions match the escaped forms ('&lt;&lt;' etc.).  As displayed,
    this block had been HTML-de-escaped by extraction (e.g. the no-op
    replace('&','&')); the entity forms restored here are the only ones
    consistent with producing parseable XML.
    """
    s = p

    # process the texinfo escape for an @
    s = s.replace('@@', '@')

    # escape characters not allowed in XML
    s = s.replace('&', '&amp;')
    s = s.replace('<', '&lt;')
    s = s.replace('>', '&gt;')

    # convert <<somecode>> to <code>somecode</code> and <[var]> to
    # <varname>var</varname>
    # also handle nested << <[ ]> >> correctly
    s = s.replace('&lt;&lt;', '<code>')
    s = s.replace('&lt;[', '<varname>')
    s = s.replace(']&gt;', '</varname>')
    s = s.replace('&gt;&gt;', '</code>')

    # also convert some simple texinfo markup
    # convert @emph{foo} to <emphasis>foo</emphasis>
    s = re.sub(r'@emph{(.*?)}', '<emphasis>\\1</emphasis>', s)
    # convert @strong{foo} to <emphasis role=strong>foo</emphasis>
    s = re.sub(r'@strong{(.*?)}', '<emphasis role="strong">\\1</emphasis>', s)
    # convert @minus{} to U+2212 MINUS SIGN
    s = s.replace('@minus{}', '&#x2212;')
    # convert @dots{} to U+2026 HORIZONTAL ELLIPSIS
    s = s.replace('@dots{}', '&#x2026;')

    # convert xref and pxref
    s = re.sub(r'@xref{(.*?)}', "See <xref linkend='\\1'/>", s)

    # very hacky way of dealing with @* to force a newline
    s = s.replace('@*', '</para><para>')

    if (verbose > 3) and (s != p):
        print('%s-> line_markup_convert ->\n%s' % (p, s), file=sys.stderr)

    return s
---|
487 | |
---|
488 | # |
---|
489 | # lexer |
---|
490 | # |
---|
491 | |
---|
# texinfo commands recognized at the start of a line, mapped to the lexer
# token type t_TEXINFO assigns for them
texinfo_commands = {
    'ifnottex' : 'IFNOTTEX',
    'end ifnottex' : 'ENDIFNOTTEX',
    'tex' : 'IFTEX',
    'end tex' : 'ENDIFTEX',
    'comment' : 'COMMENT',
    'c ' : 'COMMENT',
    'multitable' : 'MULTICOLUMNTABLE',
    'end multitable' : 'ENDMULTICOLUMNTABLE',
    'headitem' : 'MCT_HEADITEM',
    'tab' : 'MCT_COLUMN_SEPARATOR',
    'item' : 'MCT_ITEM',
}

# token names (PLY requires this module-level list)
tokens = [
    'BLANKLINE',
    'BULLETEND',
    'BULLETSTART',
    'COURIER',
    'EOF',
    'ITEM',
    'TABLEEND',
    'TABLESTART',
    'TEXINFO',
    'TEXT',
] + list(set(texinfo_commands.values()))
---|
519 | |
---|
# regular expression rules for tokens, in priority order
# (all these expressions should match a whole line)
# (PLY uses each rule function's docstring as the token's regex)
def t_TEXINFO(t):
    # this matches any @command. but not @command{} which just happens to be at
    # the start of a line
    r'@\w+[^{]*?\n'

    # if the line starts with a known texinfo command, change t.type to the
    # token for that command
    for k in texinfo_commands.keys():
        if t.value[1:].startswith(k):
            t.type = texinfo_commands[k]
            break

    return t
---|
535 | |
---|
def t_COURIER(t):
    r'[.|].*\n'
    # a line starting with '.' or '|' is preformatted (courier) text;
    # drop the marker character and convert any inline markup
    t.value = line_markup_convert(t.value[1:])
    return t
---|
540 | |
---|
def t_BULLETSTART(t):
    # 'O+' alone on a line opens a bulleted list
    r'O\+\n'
    return t
---|
544 | |
---|
def t_BULLETEND(t):
    # 'O-' alone on a line closes a bulleted list
    r'O-\n'
    return t
---|
548 | |
---|
def t_TABLESTART(t):
    # 'o+' alone on a line opens a two-column table
    r'o\+\n'
    return t
---|
552 | |
---|
def t_TABLEEND(t):
    # 'o-' alone on a line closes a two-column table
    r'o-\n'
    return t
---|
556 | |
---|
def t_ITEM(t):
    # 'o ' starts a list item / table row
    r'o\s.*\n'
    # strip the leading 'o ' marker, then convert inline markup
    t.value = re.sub('o\s', '', lexer.lexmatch.group(0), 1)
    t.value = line_markup_convert(t.value)
    return t
---|
562 | |
---|
def t_TEXT(t):
    # any other non-empty line is plain text
    r'.+\n'
    t.value = line_markup_convert(t.value)
    # NOTE(review): only TEXT and BLANKLINE advance lineno, so error line
    # numbers drift on COURIER/ITEM/TEXINFO lines — presumably tolerated
    t.lexer.lineno += 1
    return t
---|
568 | |
---|
def t_BLANKLINE(t):
    # an empty line terminates a paragraph
    r'\n'
    t.lexer.lineno += 1
    return t
---|
573 | |
---|
def t_eof(t):
    # Emit a synthetic EOF token exactly once at end of input, so the
    # grammar has an explicit end marker; the second call (flag set) really
    # terminates the token stream and resets state for the next input.
    if hasattr(t.lexer,'at_eof'):
        # remove eof flag ready for lexing next input
        delattr(t.lexer,'at_eof')
        t.lexer.lineno = 0
        return None

    t.type = 'EOF'
    t.lexer.at_eof = True;

    return t
---|
585 | |
---|
# Error handling rule
def t_error(t):
    # any input the rules above cannot tokenize is fatal
    print("tokenization error, remaining text '%s'" % t.value, file=sys.stderr)
    exit(1)
---|
590 | |
---|
# build the lexer from the t_* rules above (PLY inspects this module's namespace)
lexer = lex.lex()
---|
592 | |
---|
593 | # |
---|
594 | # parser |
---|
595 | # |
---|
596 | |
---|
def parser_verbose(p):
    # debugging aid: show each production's result as it is reduced
    if verbose > 2:
        print(p[0], file=sys.stderr)
---|
def p_input(p):
    '''input : paragraph
             | input paragraph'''
    # start symbol: accumulate paragraphs separated by newlines
    if len(p) == 3:
        p[0] = p[1] + '\n' + p[2]
    else:
        p[0] = p[1]
    parser_verbose(p)
---|
609 | |
---|
# Strictly, text at top level should be paragraphs (i.e terminated by a
# BLANKLINE), while text contained in rows or bullets may not be, but this
# grammar doesn't enforce that for simplicity's sake.
def p_paragraph(p):
    '''paragraph : paragraph_content maybe_eof_or_blankline'''
    # wrap the accumulated content in a DocBook para element
    p[0] = '<para>\n' + p[1] + '</para>'
    parser_verbose(p)
---|
617 | |
---|
def p_paragraph_content(p):
    '''paragraph_content : paragraph_line
                         | paragraph_line paragraph_content'''
    # concatenate successive lines of a paragraph
    if len(p) == 3:
        p[0] = p[1] + p[2]
    else:
        p[0] = p[1]
    parser_verbose(p)
---|
626 | |
---|
def p_paragraph_line(p):
    '''paragraph_line : TEXT
                      | texinfocmd
                      | courierblock
                      | table
                      | bulletlist'''
    # a paragraph line is any one of these constructs, passed through
    p[0] = p[1]
---|
634 | |
---|
def p_empty(p):
    # the empty production, used for optional elements
    'empty :'
    p[0] = ''
---|
638 | |
---|
def p_maybe_eof_or_blankline(p):
    '''maybe_eof_or_blankline : empty
                              | EOF
                              | BLANKLINE
                              | BLANKLINE EOF'''
    # paragraph terminators produce no output of their own
    p[0] = ''
---|
645 | |
---|
def p_maybe_lines(p):
    '''maybe_lines : empty
                   | paragraph maybe_lines'''
    # zero or more paragraphs (used inside bullets and table rows)
    if len(p) == 3:
        p[0] = p[1] + p[2]
    else:
        p[0] = p[1]
    parser_verbose(p)
---|
654 | |
---|
def p_maybe_blankline(p):
    '''maybe_blankline : empty
                       | BLANKLINE'''
    # an optional blank line, discarded
    p[0] = ''
---|
659 | |
---|
def p_courierblock(p):
    '''courierblock : courier'''
    # preformatted text becomes a monospaced literallayout block
    p[0] = '<literallayout class="monospaced">' + p[1] + '</literallayout>'
    parser_verbose(p)
---|
664 | |
---|
def p_courier(p):
    '''courier : COURIER
               | COURIER courier'''
    # concatenate consecutive courier lines
    if len(p) == 3:
        p[0] = p[1] + p[2]
    else:
        p[0] = p[1]
    parser_verbose(p)
---|
673 | |
---|
def p_bullet(p):
    '''bullet : ITEM maybe_lines
              | ITEM BLANKLINE maybe_lines'''
    if len(p) == 3:
        # Glue any text in ITEM into the first para of maybe_lines
        # (This is an unfortunate consequence of the line-based tokenization we do)
        if p[2].startswith('<para>'):
            p[0] = '<listitem><para>' + p[1] + p[2][len('<para>'):] + '</listitem>'
        else:
            p[0] = '<listitem><para>' + p[1] + '</para>' + p[2] + '</listitem>'
    else:
        # a blank line separated the ITEM text from the following paragraphs
        p[0] = '<listitem><para>' + p[1] + '</para>' + p[3] + '</listitem>'
    parser_verbose(p)
---|
687 | |
---|
def p_bullets(p):
    '''bullets : bullet
               | bullet bullets'''
    # accumulate list items
    if len(p) == 3:
        p[0] = p[1] + '\n' + p[2]
    else:
        p[0] = p[1]
    parser_verbose(p)
---|
696 | |
---|
def p_bulletlist(p):
    '''bulletlist : BULLETSTART bullets BULLETEND maybe_blankline'''
    # an O+ ... O- region becomes an itemizedlist
    p[0] = '<itemizedlist>' + p[2] + '</itemizedlist>'
    parser_verbose(p)
---|
701 | |
---|
def p_row(p):
    '''row : ITEM maybe_lines
           | ITEM BLANKLINE maybe_lines'''
    # first column is the ITEM text (as code), second the following lines
    if len(p) == 3:
        p[0] = '<row><entry><code>' + p[1] + '</code></entry><entry>' + p[2] + '</entry></row>'
    else:
        p[0] = '<row><entry><code>' + p[1] + '</code></entry><entry>' + p[3] + '</entry></row>'
    parser_verbose(p)
---|
710 | |
---|
def p_rows(p):
    '''rows : row
            | row rows'''
    # accumulate table rows
    if len(p) == 3:
        p[0] = p[1] + '\n' + p[2]
    else:
        p[0] = p[1]
    parser_verbose(p)
---|
719 | |
---|
def p_table(p):
    '''table : TABLESTART rows TABLEEND maybe_blankline'''
    # an o+ ... o- region becomes a two-column informaltable
    p[0] = '<informaltable><tgroup cols="2"><tbody>' + p[2] + '</tbody></tgroup></informaltable>'
    parser_verbose(p)
---|
724 | |
---|
def p_texinfocmd(p):
    '''texinfocmd : unknown_texinfocmd
                  | comment
                  | multitable
                  | nottex
                  | tex'''
    # the texinfo constructs we understand, passed through
    p[0] = p[1]
---|
732 | |
---|
def p_unknown_texinfocmd(p):
    '''unknown_texinfocmd : TEXINFO'''
    # warn, but pass the unrecognized command through to the output
    print("unknown texinfo command '%s'" % p[1].strip(), file=sys.stderr)
    p[0] = p[1]
    parser_verbose(p)
---|
738 | |
---|
def p_nottex(p):
    '''nottex : IFNOTTEX paragraph_content ENDIFNOTTEX'''
    # keep the content of @ifnottex regions, dropping the delimiters
    p[0] = p[2]
---|
742 | |
---|
def p_tex(p):
    '''tex : IFTEX paragraph_content ENDIFTEX'''
    # text for TeX formatter inside @iftex is discarded
    p[0] = ''
---|
747 | |
---|
def p_comment(p):
    '''comment : COMMENT'''
    # comment text is discarded
    p[0] = ''
---|
752 | |
---|
def p_mct_columns(p):
    '''mct_columns : maybe_lines
                   | maybe_lines MCT_COLUMN_SEPARATOR mct_columns'''
    # @tab-separated cells of a multitable row become entry elements
    if len(p) == 4:
        p[0] = '<entry>' + p[1] + '</entry>' + p[3]
    else:
        p[0] = '<entry>' + p[1] + '</entry>'
    parser_verbose(p)
---|
761 | |
---|
def p_mct_row(p):
    '''mct_row : MCT_ITEM mct_columns'''
    # an @item line with its columns becomes a body row
    p[0] = '<row>' + p[2] + '</row>'
    parser_verbose(p)
---|
766 | |
---|
def p_mct_rows(p):
    '''mct_rows : mct_row
                | mct_row mct_rows'''
    # accumulate multitable body rows
    if len(p) == 3:
        p[0] = p[1] + '\n' + p[2]
    else:
        p[0] = p[1]
    parser_verbose(p)
---|
775 | |
---|
def p_mct_header(p):
    '''mct_header : MCT_HEADITEM mct_columns'''
    # an @headitem line with its columns becomes the header row
    p[0] = '<row>' + p[2] + '</row>'
    parser_verbose(p)
---|
780 | |
---|
def p_multitable(p):
    '''multitable : MULTICOLUMNTABLE mct_header mct_rows ENDMULTICOLUMNTABLE'''
    # this doesn't handle the prototype row form of @multitable, only the @columnfractions form
    colfrac = p[1].replace('@multitable @columnfractions', '').split()
    # one proportional colspec per @columnfractions value
    colspec = '\n'.join(['<colspec colwidth="%s*"/>' % (c) for c in colfrac])
    header = '<thead>' + p[2] + '</thead>\n'
    body = '<tbody>' + p[3] + '</tbody>\n'
    p[0] = '<informaltable><tgroup cols="' + str(len(colfrac)) +'">' + colspec + header + body + '</tgroup></informaltable>'
    parser_verbose(p)
---|
790 | |
---|
def p_error(t):
    # any syntax error in the section text is fatal
    print('parse error at line %d, token %s, next token %s' % (t.lineno, t, parser.token()), file=sys.stderr)
    exit(1)
---|
794 | |
---|
# build the parser from the p_* rules above; 'input' is the start symbol
parser = yacc.yacc(start='input')
---|
796 | |
---|
797 | # |
---|
798 | # |
---|
799 | # |
---|
800 | |
---|
def main(file):
    """Read makedoc-marked-up source from the open file object, run the
    three processing stages, and write the resulting XML tree to stdout.

    Exits with status 1 if the input contained no makedoc markup.
    """
    content = file.read()
    # the file may have been opened in binary mode (see __main__); under
    # Python 3 the string processing below needs text
    if isinstance(content, bytes):
        content = content.decode('utf-8')
    content = remove_noncomments(content)
    processed = process(content)
    perform(processed)

    # output the XML tree
    # (check first: tostring(None) would raise before the friendly message)
    if rootelement is None:
        print('No output produced (perhaps the input has no makedoc markup?)', file=sys.stderr)
        exit(1)

    # encoding='unicode' makes tostring return str rather than bytes, so
    # print() and re.search() below behave correctly under Python 3
    s = lxml.etree.tostring(rootelement, pretty_print=True, encoding='unicode')

    print(s)

    # warn about texinfo commands which didn't get processed
    match = re.search('@[a-z*]+', s)
    if match:
        print('texinfo command %s remains in output' % match.group(), file=sys.stderr)
---|
821 | # |
---|
822 | # |
---|
823 | # |
---|
824 | |
---|
if __name__ == '__main__' :
    options = OptionParser()
    # default=0: 'count' options otherwise default to None, and the
    # 'verbose > n' comparisons elsewhere raise TypeError on Python 3
    options.add_option('-v', '--verbose', action='count', dest = 'verbose', default = 0)
    options.add_option('-c', '--cache', action='store_true', dest = 'cache', help="just ensure PLY cache is up to date")
    (opts, args) = options.parse_args()

    if opts.cache:
        # building lexer and parser above already refreshed the PLY cache
        sys.exit()

    verbose = opts.verbose

    # read from the named file if given, otherwise from stdin
    if len(args) > 0:
        main(open(args[0], 'rb'))
    else:
        main(sys.stdin)
---|