[444] | 1 | #!/usr/bin/env python |
---|
| 2 | # |
---|
| 3 | # python script to process makedoc instructions in a source file and produce |
---|
| 4 | # DocBook XML output |
---|
| 5 | # |
---|
| 6 | |
---|
| 7 | # |
---|
| 8 | # This performs 3 stages of processing on it's input, in a similar fashion |
---|
| 9 | # to makedoc: |
---|
| 10 | # |
---|
| 11 | # 1. Discard everything outside of /* */ comments |
---|
| 12 | # 2. Identify lines which contains commands (a single uppercase word) |
---|
| 13 | # 3. Apply each command to the text of the following lines (up to the next |
---|
| 14 | # command or the end of the comment block), to produce some output |
---|
| 15 | # |
---|
| 16 | # The resulting output contains one or more DocBook XML refentry elements. |
---|
| 17 | # |
---|
| 18 | # To make the output a valid XML document which can be xincluded, those refentry |
---|
| 19 | # elements are contained by a refcontainer element. refcontainer is not part of |
---|
| 20 | # the DocBook DTD and should be removed by a suitable XSLT. |
---|
| 21 | # |
---|
| 22 | |
---|
| 23 | from __future__ import print_function |
---|
| 24 | |
---|
| 25 | import sys |
---|
| 26 | import re |
---|
| 27 | from optparse import OptionParser |
---|
| 28 | import lxml.etree |
---|
| 29 | import ply.lex as lex |
---|
| 30 | import ply.yacc as yacc |
---|
| 31 | |
---|
# Module-level state shared by the command handlers below.
rootelement = None # root element of the XML tree
refentry = None # the current refentry
verbose = 0 # verbosity level; overwritten from the -v option in __main__
---|
| 35 | |
---|
def dump(s, stage, threshold = 1):
    """Print s to stderr framed by marker lines, when verbosity exceeds threshold.

    stage is a short label identifying where in the pipeline the dump was made.
    """
    if verbose <= threshold:
        return
    banner = '*' * 40
    print(banner, file=sys.stderr)
    print(stage, file=sys.stderr)
    print(banner, file=sys.stderr)
    print('%s' % s, file=sys.stderr)
    print(banner, file=sys.stderr)
---|
| 43 | |
---|
| 44 | # |
---|
| 45 | # Stage 1 |
---|
| 46 | # |
---|
| 47 | |
---|
def skip_whitespace_and_stars(i, src):
    """Return the index of the first character at or after position i in src
    which is neither whitespace nor a decorative '*'.

    A '*' which begins a '*/' comment terminator is NOT skipped, so the
    caller can still detect the end of the comment block.
    """
    # The i + 1 lookahead is bounds-checked so a trailing '*' at the very end
    # of src cannot raise IndexError (the original indexed src[i+1] blindly).
    while i < len(src) and (src[i].isspace() or
                            (src[i] == '*' and (i + 1 >= len(src) or src[i+1] != '/'))):
        i += 1

    return i
---|
| 54 | |
---|
| 55 | # Discard everything not inside '/* */' style-comments which start at column 0 |
---|
| 56 | # Discard any leading blank space or '*' |
---|
| 57 | # Discard a single leading '.' |
---|
| 58 | # Discard blank lines after a blank line |
---|
def comment_contents_generator(src):
    """Yield, character by character, the text contained in '/* */' comments
    whose opener starts at column 0 of src.

    Leading blank space and '*' decoration is dropped from each line, a single
    leading '.' is dropped, and at most one blank line in a row is kept.
    Each comment block is terminated by an 'END' line in the output.
    """
    i = 0

    while i < len(src) - 2:
        # a comment opener only counts when it starts a line ('\n' precedes it;
        # the caller prepends '\n' so the first line is covered too)
        if src[i] == '\n' and src[i+1] == '/' and src[i+2] == '*':
            i = i + 3

            i = skip_whitespace_and_stars(i, src)

            # NOTE(review): if the comment is all whitespace/stars, i may equal
            # len(src) here and this index would raise IndexError — confirm
            # inputs always contain text before EOF
            if src[i] == '.':
                i += 1

            # emit the body of this comment block
            while i < len(src):
                if src[i] == '\n':
                    yield '\n'
                    i += 1

                    # allow a single blank line
                    if i < len(src) and src[i] == '\n':
                        yield '\n'
                        i += 1

                    # drop the leading decoration of the next line
                    i = skip_whitespace_and_stars(i, src)

                elif src[i] == '*' and src[i+1] == '/':
                    i = i + 2
                    # If we have just output \n\n, this adds another blank line.
                    # This is the only way a double blank line can occur.
                    yield '\nEND\n'
                    break
                else:
                    yield src[i]
                    i += 1
        else:
            i += 1
---|
| 94 | |
---|
def remove_noncomments(src):
    """Return only the comment content of src, discarding everything which is
    not inside a column-0 '/* */' comment."""
    # Prefix a newline so an opener on the very first line still matches the
    # '\n/*' pattern used by the generator.
    extracted = ''.join(comment_contents_generator('\n' + src))
    dump(extracted, 'extracted from comments')
    return extracted
---|
| 101 | |
---|
| 102 | # |
---|
| 103 | # Stage 2 |
---|
| 104 | # |
---|
| 105 | |
---|
| 106 | # A command is a single word of at least 3 characters, all uppercase, and alone on a line |
---|
# A command is a single word of at least 3 characters, all uppercase, and alone on a line
def iscommand(l):
    """Return True if line l is a makedoc command word (>=3 chars of [A-Z_],
    alone on the line apart from trailing whitespace)."""
    # raw string: avoids the invalid escape sequence warning '\s' raises on
    # modern Pythons; bool() collapses the match object to the same truth value
    return bool(re.match(r'^[A-Z_]{3,}\s*$', l))
---|
| 112 | |
---|
def command_block_generator(content):
    """Yield (command, text) pairs from content.

    Each command line starts a new pair; the text is everything up to the next
    command line.  Text before the first command is paired with 'START'.
    """
    pending_command = 'START'
    pending_lines = []

    for line in content.splitlines():
        if iscommand(line):
            yield (pending_command, ''.join(x + '\n' for x in pending_lines))
            pending_command = line.rstrip()
            pending_lines = []
        else:
            pending_lines.append(line)

    # flush the text following the final command
    yield (pending_command, ''.join(x + '\n' for x in pending_lines))
---|
| 125 | |
---|
| 126 | # Look for commands, which give instructions how to process the following input |
---|
# Look for commands, which give instructions how to process the following input
def process(content):
    """Split content into an ordered list of (command, text) tuples."""
    content = content.lstrip()

    dump(content, 'about to process for commands')

    # The order of the tuples must be preserved: the commands generate their
    # output sections in sequence.
    return list(command_block_generator(content))
---|
| 137 | |
---|
| 138 | # |
---|
| 139 | # Stage 3 |
---|
| 140 | # |
---|
| 141 | |
---|
| 142 | # invoke each command on it's text |
---|
# invoke each command on it's text
def perform(processed):
    """Dispatch each (command, text) tuple in processed to its handler."""
    for command, text in processed:
        c = command.rstrip()
        t = text.strip() + '\n'

        if verbose:
            print("performing command '%s'" % c, file=sys.stderr)

        handler = command_dispatch_dict.get(c)
        if handler is not None:
            handler(c, t)
        else:
            # the text following an unrecognized command is discarded
            print("command '%s' is not recognized" % c, file=sys.stderr)
---|
| 156 | |
---|
| 157 | # FUNCTION (aka TYPEDEF) |
---|
| 158 | # |
---|
def function(c, l):
    """Process a FUNCTION (or TYPEDEF) command.

    l contains one or more 'name, name --- description' entries.  Starts a new
    refentry named after the first function, with a refname for every listed
    name and a refpurpose assembled from the descriptions.
    """
    global refentry
    global rootelement

    l = l.strip()
    if verbose:
        print('FUNCTION %s' % l, file=sys.stderr)

    separator = '---'

    if ';' in l:
        # fpclassify has an unusual format we also need to handle
        spliton = ';'
        l = l.splitlines()[0]
    elif len(l.splitlines()) > 1:
        # a few pages like mktemp have two '---' lines
        spliton = ';'
        o = ''
        for i in l.splitlines():
            if separator in i:
                o += i + ';'
            else:
                o += i
        l = o[:-1]
    else:
        spliton = '\n'

    namelist = []
    descrlist = []
    for a in l.split(spliton):
        (n, d) = a.split(separator, 1)
        namelist = namelist + n.split(',')
        descrlist = descrlist + [d]

    # only copysign and log1p use <[ ]> markup in descr,
    # only gets() uses << >> markup
    # but we should handle it correctly
    descr = line_markup_convert(', '.join(descrlist))

    # fpclassify includes an 'and' we need to discard
    # (list() is required: under Python 3 a bare map() is a one-shot iterator,
    # which is not subscriptable — namelist[0] below would raise TypeError —
    # and would be exhausted by the verbose print before the refname loop)
    namelist = list(map(lambda v: re.sub('^and ', '', v.strip(), 1), namelist))
    # strip off << >> surrounding name
    namelist = list(map(lambda v: v.strip().lstrip('<').rstrip('>'), namelist))

    if verbose:
        print(namelist, file=sys.stderr)
    # additional alternate names may also appear in INDEX commands

    # create the root element if needed
    if rootelement is None:
        rootelement = lxml.etree.Element('refentrycontainer')

    # FUNCTION implies starting a new refentry
    if refentry is not None:
        print("multiple FUNCTIONs without NEWPAGE", file=sys.stderr)
        exit(1)

    # create the refentry
    refentry = lxml.etree.SubElement(rootelement, 'refentry')
    refentry.append(lxml.etree.Comment(' Generated by makedocbook.py '))
    refentry.set('id', namelist[0].lstrip('_'))

    refmeta = lxml.etree.SubElement(refentry, 'refmeta')
    # refentrytitle will be same as refdescriptor, the primary name
    refentrytitle = lxml.etree.SubElement(refmeta, 'refentrytitle')
    refentrytitle.text = namelist[0]
    manvolnum = lxml.etree.SubElement(refmeta, 'manvolnum')
    manvolnum.text = '3'

    refnamediv = lxml.etree.SubElement(refentry, 'refnamediv')
    # refdescriptor is the primary name, assume we should use the one which
    # appears first in the list
    refdescriptor = lxml.etree.SubElement(refnamediv, 'refdescriptor')
    refdescriptor.text = namelist[0]
    # refname elements exist for all alternate names
    for n in namelist:
        refname = lxml.etree.SubElement(refnamediv, 'refname')
        refname.text = n
    refpurpose = lxml.etree.SubElement(refnamediv, 'refpurpose')
    refnamediv.replace(refpurpose, lxml.etree.fromstring('<refpurpose>' + descr + '</refpurpose>'))
---|
| 239 | |
---|
| 240 | # Only FUNCTION currently exists, which implies that the SYNOPSIS should be |
---|
| 241 | # a funcsynopsis. If TYPEDEF was to be added, SYNOPSIS should be processed |
---|
| 242 | # in a different way, probably producing a refsynopsis. |
---|
| 243 | |
---|
| 244 | # INDEX |
---|
| 245 | # may occur more than once for each FUNCTION giving alternate names this |
---|
| 246 | # function should be indexed under |
---|
| 247 | # |
---|
def index(c, l):
    """Process an INDEX command: add an indexterm for the first word of l to
    the current refentry, and list it as an additional refname if it is not
    one already."""
    l = l.strip()

    if verbose:
        print('INDEX %s' % l, file=sys.stderr)

    # discard anything after the first word
    l = l.split()[0]

    # add indexterm
    # (we could just index under all the refnames, but we control the indexing
    # separately as that is what makedoc does)
    indexterm = lxml.etree.SubElement(refentry, 'indexterm')
    primary = lxml.etree.SubElement(indexterm, 'primary')
    primary.text = l

    # to validate, it seems we need to maintain refentry elements in a certain order
    refentry[:] = sorted(refentry, key = lambda x: x.tag)

    # adds another alternate refname
    refnamediv = refentry.find('refnamediv')

    # as long as it doesn't already exist
    if not refnamediv.xpath(('refname[.="%s"]') % l):
        refname = lxml.etree.SubElement(refnamediv, 'refname')
        refname.text = l
        if verbose > 1:
            print('added refname %s' % l, file=sys.stderr)
    else:
        if verbose > 1:
            print('duplicate refname %s discarded' % l, file=sys.stderr)

    # to validate, it seems we need to maintain refnamediv elements in a certain order
    refnamediv[:] = sorted(refnamediv, key = lambda x: x.tag)
---|
| 282 | |
---|
| 283 | |
---|
| 284 | # SYNOPSIS aka ANSI_SYNOPSIS |
---|
| 285 | # ANSI-style synopsis |
---|
| 286 | # |
---|
| 287 | # Note that makedoc would also process <<code>> markup here, but there are no |
---|
| 288 | # such uses. |
---|
| 289 | # |
---|
def synopsis(c, t):
    """Process a SYNOPSIS (aka ANSI_SYNOPSIS) command.

    Prototype lines become funcprototype elements; preprocessor directives,
    structs and bracketed comments pass through as funcsynopsisinfo.
    Raises RuntimeError if unconsumed prototype text remains at the end.
    """
    refsynopsisdiv = lxml.etree.SubElement(refentry, 'refsynopsisdiv')
    funcsynopsis = lxml.etree.SubElement(refsynopsisdiv, 'funcsynopsis')

    s = ''
    for l in t.splitlines():
        # raw string for the regex avoids an invalid '\s' escape warning
        if re.match(r'\s*(#|\[|struct)', l):
            # preprocessor # directives, structs, comments in square brackets
            funcsynopsisinfo = lxml.etree.SubElement(funcsynopsis, 'funcsynopsisinfo')
            funcsynopsisinfo.text = l.strip() + '\n'
        else:
            # accumulate continuation lines until a complete prototype is seen
            s = s + l

            # a prototype without a terminating ';' is an error
            if s.endswith(')'):
                print("'%s' missing terminating semicolon" % l, file=sys.stderr)
                s = s + ';'
                exit(1)

            if ';' in s:
                synopsis_for_prototype(funcsynopsis, s)
                s = ''

    if s.strip():
        print("surplus synopsis '%s'" % s, file=sys.stderr)
        # the original used a bare 'raise' here, which has no active exception
        # and itself fails with RuntimeError; raise one explicitly instead
        raise RuntimeError("surplus synopsis '%s'" % s)
---|
| 316 | |
---|
def synopsis_for_prototype(funcsynopsis, s):
    """Convert the ';'-separated C prototypes in s into funcprototype
    elements under funcsynopsis.

    Relies on parameter names being marked up as <[name]> so return type,
    function name and each parameter can be separated.
    """
    s = s.strip()

    # funcsynopsis has a very detailed content model, so we need to massage the
    # bare prototype into it. Fortunately, since the parameter names are marked
    # up, we have enough information to do this.
    for fp in s.split(';'):
        fp = fp.strip()
        if fp:

            if verbose:
                print("'%s'" % fp, file=sys.stderr)

            # group 1: return type, group 2: function name, group 3: parameters
            match = re.match(r'(.*?)([\w\d]*) ?\((.*)\)', fp)

            if verbose:
                print(match.groups(), file=sys.stderr)

            funcprototype = lxml.etree.SubElement(funcsynopsis, 'funcprototype')
            funcdef = lxml.etree.SubElement(funcprototype, 'funcdef')
            funcdef.text = match.group(1)
            function = lxml.etree.SubElement(funcdef, 'function')
            function.text = match.group(2)

            if match.group(3).strip() == 'void':
                void = lxml.etree.SubElement(funcprototype, 'void')
            else:
                # Split parameters on ',' except if it is inside ()
                for p in re.split(',(?![^()]*\))', match.group(3)):
                    p = p.strip()

                    if verbose:
                        print(p, file=sys.stderr)

                    if p == '...':
                        varargs = lxml.etree.SubElement(funcprototype, 'varargs')
                    else:
                        paramdef = lxml.etree.SubElement(funcprototype, 'paramdef')
                        parameter = lxml.etree.SubElement(paramdef, 'parameter')

                        # <[ ]> enclose the parameter name
                        match2 = re.match('(.*)<\[(.*)\]>(.*)', p)

                        if verbose:
                            print(match2.groups(), file=sys.stderr)

                        # text before the name (the type), the name itself, and
                        # anything after it (e.g. array suffix) respectively
                        paramdef.text = match2.group(1)
                        parameter.text = match2.group(2)
                        parameter.tail = match2.group(3)
---|
| 366 | |
---|
| 367 | |
---|
| 368 | # DESCRIPTION |
---|
| 369 | # (RETURNS, ERRORS, PORTABILITY, BUGS, WARNINGS, SEEALSO, NOTES are handled the same) |
---|
| 370 | # |
---|
| 371 | # Create a refsect with a title corresponding to the command |
---|
| 372 | # |
---|
| 373 | # Nearly all the the existing DESCRIPTION contents could be transformed into |
---|
| 374 | # DocBook with a few regex substitutions. Unfortunately, pages like sprintf and |
---|
| 375 | # sscanf, have very complex layout using nested tables and itemized lists, which |
---|
| 376 | # it is best to parse in order to transform correctly. |
---|
| 377 | # |
---|
| 378 | |
---|
def refsect(t, s):
    """Create a refsect1 titled after command t, parsing section text s for
    makedoc markup and the small texinfo subset the grammar understands."""
    refsect = lxml.etree.SubElement(refentry, 'refsect1')
    title = lxml.etree.SubElement(refsect, 'title')
    title.text = t.title()

    if verbose:
        print('%s has %d paragraphs' % (t, len(s.split('\n\n'))) , file=sys.stderr)

    if verbose > 1:
        dump(s, 'before lexing')

        # dump out lexer token sequence (debug only: the original ran this
        # unconditionally, spraying every token to stderr for every section)
        lex.input(s)
        for tok in lexer:
            print(tok, file=sys.stderr)

    # parse the section text for makedoc markup and the few pieces of texinfo
    # markup we understand, and output an XML marked-up string
    xml = parser.parse(s, tracking=True, debug=(verbose > 2))

    dump(xml, 'after parsing')

    xml = '<refsect1>' + xml + '</refsect1>'

    refsect.extend(lxml.etree.fromstring(xml))
---|
| 404 | |
---|
def seealso(c, t):
    """Process SEEALSO by emitting a refsect titled 'See Also'."""
    refsect('SEE ALSO', t)
---|
| 407 | |
---|
| 408 | # NEWPAGE |
---|
| 409 | # |
---|
| 410 | # start a new refentry |
---|
| 411 | |
---|
def newpage(c, t):
    """Process NEWPAGE: close the current refentry so the next FUNCTION
    starts a fresh one."""
    global refentry
    refentry = None
---|
| 415 | |
---|
| 416 | # command dispatch table |
---|
| 417 | |
---|
def discarded(c, t):
    """Handler for commands whose following text produces no output."""
    return
---|
| 420 | |
---|
# maps a makedoc command name to its handler(command, text)
command_dispatch_dict = {
    'FUNCTION' : function,
    'TYPEDEF' : function, # TYPEDEF is not currently used, but described in doc.str
    'INDEX' : index,
    'TRAD_SYNOPSIS' : discarded, # K&R-style synopsis, obsolete and discarded
    'ANSI_SYNOPSIS' : synopsis,
    'SYNOPSIS' : synopsis,
    'DESCRIPTION' : refsect,
    'RETURNS' : refsect,
    'ERRORS' : refsect,
    'PORTABILITY' : refsect,
    'BUGS' : refsect,
    'WARNINGS' : refsect,
    'SEEALSO' : seealso,
    'NOTES' : refsect, # NOTES is not described in doc.str, so is currently discarded by makedoc, but that doesn't seem right
    'QUICKREF' : discarded, # The intent of QUICKREF and MATHREF is not obvious, but they don't generate any output currently
    'MATHREF' : discarded,
    'START' : discarded, # a START command is inserted to contain the text before the first command
    'END' : discarded, # an END command is inserted merely to terminate the text for the last command in a comment block
    'NEWPAGE' : newpage,
}
---|
| 442 | |
---|
| 443 | # |
---|
| 444 | # Utility functions |
---|
| 445 | # |
---|
| 446 | |
---|
| 447 | # apply transformations which are easy to do in-place |
---|
# apply transformations which are easy to do in-place
def line_markup_convert(p):
    """Return p with makedoc/texinfo inline markup converted to DocBook XML.

    XML-special characters are entity-escaped first, so the subsequent
    conversions must match the escaped forms of '<' and '>'.
    """
    s = p

    # process the texinfo escape for an @
    s = s.replace('@@', '@')

    # escape characters not allowed in XML
    # (the entity replacement strings here were corrupted to no-ops like
    # replace('&','&') by an HTML-unescaping pass; restored)
    s = s.replace('&', '&amp;')
    s = s.replace('<', '&lt;')
    s = s.replace('>', '&gt;')

    # convert <<somecode>> to <code>somecode</code> and <[var]> to
    # <varname>var</varname>
    # also handle nested << <[ ]> >> correctly
    # (these must match the *escaped* forms, since every '<' and '>' was
    # turned into an entity just above)
    s = s.replace('&lt;&lt;', '<code>')
    s = s.replace('&lt;[', '<varname>')
    s = s.replace(']&gt;', '</varname>')
    s = s.replace('&gt;&gt;', '</code>')

    # also convert some simple texinfo markup
    # convert @emph{foo} to <emphasis>foo</emphasis>
    s = re.sub('@emph{(.*?)}', '<emphasis>\\1</emphasis>', s)
    # convert @strong{foo} to <emphasis role=strong>foo</emphasis>
    s = re.sub('@strong{(.*?)}', '<emphasis role="strong">\\1</emphasis>', s)
    # convert @minus{} to U+2212 MINUS SIGN (as a numeric character reference)
    s = s.replace('@minus{}', '&#x2212;')
    # convert @dots{} to U+2026 HORIZONTAL ELLIPSIS (as a numeric character reference)
    s = s.replace('@dots{}', '&#x2026;')

    # convert xref and pxref
    s = re.sub('@xref{(.*?)}', "See <xref linkend='\\1'/>", s)

    # very hacky way of dealing with @* to force a newline
    s = s.replace('@*', '</para><para>')

    if (verbose > 3) and (s != p):
        print('%s-> line_markup_convert ->\n%s' % (p, s), file=sys.stderr)

    return s
---|
| 487 | |
---|
| 488 | # |
---|
| 489 | # lexer |
---|
| 490 | # |
---|
| 491 | |
---|
# texinfo commands recognized at the start of a line (after the leading '@'),
# mapped to the lexer token type they produce; consulted by t_TEXINFO below
texinfo_commands = {
    'ifnottex' : 'IFNOTTEX',
    'end ifnottex' : 'ENDIFNOTTEX',
    'tex' : 'IFTEX',
    'end tex' : 'ENDIFTEX',
    'comment' : 'COMMENT',
    'c ' : 'COMMENT',
    'multitable' : 'MULTICOLUMNTABLE',
    'end multitable' : 'ENDMULTICOLUMNTABLE',
    'headitem' : 'MCT_HEADITEM',
    'tab' : 'MCT_COLUMN_SEPARATOR',
    'item' : 'MCT_ITEM',
}
---|
| 505 | |
---|
| 506 | # token names |
---|
# token names
# (PLY requires the list to be named 'tokens'; the texinfo command tokens
# from the table above are appended, de-duplicated via set())
tokens = [
    'BLANKLINE',
    'BULLETEND',
    'BULLETSTART',
    'COURIER',
    'EOF',
    'ITEM',
    'TABLEEND',
    'TABLESTART',
    'TEXINFO',
    'TEXT',
] + list(set(texinfo_commands.values()))
---|
| 519 | |
---|
| 520 | # regular expression rules for tokens, in priority order |
---|
| 521 | # (all these expressions should match a whole line) |
---|
def t_TEXINFO(t):
    # this matches any @command. but not @command{} which just happens to be at
    # the start of a line
    # (the string below is the PLY token regex, not documentation)
    r'@\w+[^{]*?\n'

    # if the line starts with a known texinfo command, change t.type to the
    # token for that command
    for k in texinfo_commands.keys():
        if t.value[1:].startswith(k):
            t.type = texinfo_commands[k]
            break

    # unknown @commands keep the generic TEXINFO type
    return t
---|
| 535 | |
---|
def t_COURIER(t):
    r'[.|].*\n'
    # a line beginning '.' or '|' is monospaced output; drop the marker and
    # convert inline markup
    t.value = line_markup_convert(t.value[1:])
    return t

def t_BULLETSTART(t):
    r'O\+\n'
    # 'O+' alone on a line opens a bulleted list
    return t

def t_BULLETEND(t):
    r'O-\n'
    # 'O-' alone on a line closes a bulleted list
    return t

def t_TABLESTART(t):
    r'o\+\n'
    # 'o+' alone on a line opens a two-column table
    return t

def t_TABLEEND(t):
    r'o-\n'
    # 'o-' alone on a line closes a two-column table
    return t
---|
| 556 | |
---|
def t_ITEM(t):
    r'o\s.*\n'
    # strip the leading 'o ' marker, then convert inline markup
    # (raw string for the sub pattern: '\s' in a plain string is an invalid
    # escape sequence and warns on modern Pythons)
    t.value = re.sub(r'o\s', '', lexer.lexmatch.group(0), 1)
    t.value = line_markup_convert(t.value)
    return t
---|
| 562 | |
---|
def t_TEXT(t):
    r'.+\n'
    # any other non-empty line is plain paragraph text
    t.value = line_markup_convert(t.value)
    t.lexer.lineno += 1
    return t

def t_BLANKLINE(t):
    r'\n'
    # an empty line terminates a paragraph
    t.lexer.lineno += 1
    return t
---|
| 573 | |
---|
# Synthesize a single EOF token the first time the end of input is reached,
# so the grammar can use EOF as a paragraph terminator; the second call resets
# the lexer state ready for the next input and really stops.
# (no docstring here: PLY would interpret one as a token regex)
def t_eof(t):
    if hasattr(t.lexer,'at_eof'):
        # remove eof flag ready for lexing next input
        delattr(t.lexer,'at_eof')
        t.lexer.lineno = 0
        return None

    t.type = 'EOF'
    t.lexer.at_eof = True;

    return t
---|
| 585 | |
---|
| 586 | # Error handling rule |
---|
# Error handling rule
def t_error(t):
    # any input no token rule matches is fatal
    print("tokenization error, remaining text '%s'" % t.value, file=sys.stderr)
    exit(1)
---|
| 590 | |
---|
# build the lexer from the t_* rules above (must come after their definitions)
lexer = lex.lex()
---|
| 592 | |
---|
| 593 | # |
---|
| 594 | # parser |
---|
| 595 | # |
---|
| 596 | |
---|
def parser_verbose(p):
    """Debug helper: print the value just assigned to a production's p[0]."""
    if verbose > 2:
        print(p[0], file=sys.stderr)
---|
| 600 | |
---|
# (the triple-quoted strings in the p_* functions are PLY grammar rules,
# not documentation)
def p_input(p):
    '''input : paragraph
             | input paragraph'''
    # accumulate paragraphs, newline-separated
    if len(p) == 3:
        p[0] = p[1] + '\n' + p[2]
    else:
        p[0] = p[1]
    parser_verbose(p)

# Strictly, text at top level should be paragraphs (i.e terminated by a
# BLANKLINE), while text contained in rows or bullets may not be, but this
# grammar doesn't enforce that for simplicity's sake.
def p_paragraph(p):
    '''paragraph : paragraph_content maybe_eof_or_blankline'''
    p[0] = '<para>\n' + p[1] + '</para>'
    parser_verbose(p)

def p_paragraph_content(p):
    '''paragraph_content : paragraph_line
                         | paragraph_line paragraph_content'''
    if len(p) == 3:
        p[0] = p[1] + p[2]
    else:
        p[0] = p[1]
    parser_verbose(p)

def p_paragraph_line(p):
    '''paragraph_line : TEXT
                      | texinfocmd
                      | courierblock
                      | table
                      | bulletlist'''
    p[0] = p[1]

def p_empty(p):
    'empty :'
    # empty production used by the maybe_* rules below
    p[0] = ''

def p_maybe_eof_or_blankline(p):
    '''maybe_eof_or_blankline : empty
                              | EOF
                              | BLANKLINE
                              | BLANKLINE EOF'''
    # paragraph terminators contribute no output text
    p[0] = ''

def p_maybe_lines(p):
    '''maybe_lines : empty
                   | paragraph maybe_lines'''
    if len(p) == 3:
        p[0] = p[1] + p[2]
    else:
        p[0] = p[1]
    parser_verbose(p)

def p_maybe_blankline(p):
    '''maybe_blankline : empty
                       | BLANKLINE'''
    p[0] = ''
---|
| 659 | |
---|
def p_courierblock(p):
    '''courierblock : courier'''
    # consecutive COURIER lines become one monospaced literallayout block
    p[0] = '<literallayout class="monospaced">' + p[1] + '</literallayout>'
    parser_verbose(p)

def p_courier(p):
    '''courier : COURIER
               | COURIER courier'''
    if len(p) == 3:
        p[0] = p[1] + p[2]
    else:
        p[0] = p[1]
    parser_verbose(p)

def p_bullet(p):
    '''bullet : ITEM maybe_lines
              | ITEM BLANKLINE maybe_lines'''
    if len(p) == 3:
        # Glue any text in ITEM into the first para of maybe_lines
        # (This is an unfortunate consequence of the line-based tokenization we do)
        if p[2].startswith('<para>'):
            p[0] = '<listitem><para>' + p[1] + p[2][len('<para>'):] + '</listitem>'
        else:
            p[0] = '<listitem><para>' + p[1] + '</para>' + p[2] + '</listitem>'
    else:
        p[0] = '<listitem><para>' + p[1] + '</para>' + p[3] + '</listitem>'
    parser_verbose(p)

def p_bullets(p):
    '''bullets : bullet
               | bullet bullets'''
    if len(p) == 3:
        p[0] = p[1] + '\n' + p[2]
    else:
        p[0] = p[1]
    parser_verbose(p)

def p_bulletlist(p):
    '''bulletlist : BULLETSTART bullets BULLETEND maybe_blankline'''
    p[0] = '<itemizedlist>' + p[2] + '</itemizedlist>'
    parser_verbose(p)
---|
| 701 | |
---|
def p_row(p):
    '''row : ITEM maybe_lines
           | ITEM BLANKLINE maybe_lines'''
    # first column is the ITEM text (monospaced), second the following lines
    if len(p) == 3:
        p[0] = '<row><entry><code>' + p[1] + '</code></entry><entry>' + p[2] + '</entry></row>'
    else:
        p[0] = '<row><entry><code>' + p[1] + '</code></entry><entry>' + p[3] + '</entry></row>'
    parser_verbose(p)

def p_rows(p):
    '''rows : row
            | row rows'''
    if len(p) == 3:
        p[0] = p[1] + '\n' + p[2]
    else:
        p[0] = p[1]
    parser_verbose(p)

def p_table(p):
    '''table : TABLESTART rows TABLEEND maybe_blankline'''
    p[0] = '<informaltable><tgroup cols="2"><tbody>' + p[2] + '</tbody></tgroup></informaltable>'
    parser_verbose(p)
---|
| 724 | |
---|
def p_texinfocmd(p):
    '''texinfocmd : unknown_texinfocmd
                  | comment
                  | multitable
                  | nottex
                  | tex'''
    p[0] = p[1]

def p_unknown_texinfocmd(p):
    '''unknown_texinfocmd : TEXINFO'''
    # unrecognized @commands are passed through with a warning
    print("unknown texinfo command '%s'" % p[1].strip(), file=sys.stderr)
    p[0] = p[1]
    parser_verbose(p)

def p_nottex(p):
    '''nottex : IFNOTTEX paragraph_content ENDIFNOTTEX'''
    # text inside @ifnottex is kept
    p[0] = p[2]

def p_tex(p):
    '''tex : IFTEX paragraph_content ENDIFTEX'''
    # text for TeX formatter inside @iftex is discarded
    p[0] = ''

def p_comment(p):
    '''comment : COMMENT'''
    # comment text is discarded
    p[0] = ''
---|
| 752 | |
---|
def p_mct_columns(p):
    '''mct_columns : maybe_lines
                   | maybe_lines MCT_COLUMN_SEPARATOR mct_columns'''
    # columns of an @multitable row, separated by @tab
    if len(p) == 4:
        p[0] = '<entry>' + p[1] + '</entry>' + p[3]
    else:
        p[0] = '<entry>' + p[1] + '</entry>'
    parser_verbose(p)

def p_mct_row(p):
    '''mct_row : MCT_ITEM mct_columns'''
    p[0] = '<row>' + p[2] + '</row>'
    parser_verbose(p)

def p_mct_rows(p):
    '''mct_rows : mct_row
                | mct_row mct_rows'''
    if len(p) == 3:
        p[0] = p[1] + '\n' + p[2]
    else:
        p[0] = p[1]
    parser_verbose(p)

def p_mct_header(p):
    '''mct_header : MCT_HEADITEM mct_columns'''
    p[0] = '<row>' + p[2] + '</row>'
    parser_verbose(p)

def p_multitable(p):
    '''multitable : MULTICOLUMNTABLE mct_header mct_rows ENDMULTICOLUMNTABLE'''
    # this doesn't handle the prototype row form of @multitable, only the @columnfractions form
    colfrac = p[1].replace('@multitable @columnfractions', '').split()
    colspec = '\n'.join(['<colspec colwidth="%s*"/>' % (c) for c in colfrac])
    header = '<thead>' + p[2] + '</thead>\n'
    body = '<tbody>' + p[3] + '</tbody>\n'
    p[0] = '<informaltable><tgroup cols="' + str(len(colfrac)) +'">' + colspec + header + body + '</tgroup></informaltable>'
    parser_verbose(p)
---|
| 790 | |
---|
def p_error(t):
    # t is the offending token
    # NOTE(review): PLY passes t=None for an error at end of input, which would
    # make t.lineno raise AttributeError here — confirm whether that can occur
    print('parse error at line %d, token %s, next token %s' % (t.lineno, t, parser.token()), file=sys.stderr)
    exit(1)
---|
| 794 | |
---|
# build the parser from the p_* rules above; 'input' is the start symbol
parser = yacc.yacc(start='input')
---|
| 796 | |
---|
| 797 | # |
---|
| 798 | # |
---|
| 799 | # |
---|
| 800 | |
---|
def main(file):
    """Read makedoc source from the open file object 'file', process it, and
    print the resulting DocBook XML document to stdout."""
    content = file.read()
    # the input may have been opened in binary mode ('rb' in __main__):
    # decode to text before processing
    if isinstance(content, bytes):
        content = content.decode('utf-8')
    content = remove_noncomments(content)
    processed = process(content)
    perform(processed)

    if rootelement is None:
        print('No output produced (perhaps the input has no makedoc markup?)', file=sys.stderr)
        exit(1)

    # output the XML tree
    # (under Python 3 tostring() returns bytes: decode once so print() emits
    # the document text rather than a bytes repr, and so the str regex below
    # doesn't raise TypeError)
    s = lxml.etree.tostring(rootelement, pretty_print=True).decode('utf-8')

    print(s)

    # warn about texinfo commands which didn't get processed
    match = re.search('@[a-z*]+', s)
    if match:
        print('texinfo command %s remains in output' % match.group(), file=sys.stderr)
---|
| 820 | |
---|
| 821 | # |
---|
| 822 | # |
---|
| 823 | # |
---|
| 824 | |
---|
if __name__ == '__main__' :
    options = OptionParser()
    # default=0: action='count' otherwise leaves dest as None when -v is never
    # given, and the 'verbose > threshold' comparisons elsewhere would raise
    # TypeError under Python 3
    options.add_option('-v', '--verbose', action='count', dest = 'verbose', default = 0)
    options.add_option('-c', '--cache', action='store_true', dest = 'cache', help="just ensure PLY cache is up to date")
    (opts, args) = options.parse_args()

    if opts.cache:
        sys.exit()

    verbose = opts.verbose

    # read from the named file, or stdin if none given
    if len(args) > 0:
        main(open(args[0], 'rb'))
    else:
        main(sys.stdin)
---|