# :Id: $Id: latex2mathml.py 9338 2023-04-08 21:08:47Z milde $ # :Copyright: © 2005 Jens Jørgen Mortensen [1]_ # © 2010, 2021 Günter Milde. # # :License: Released under the terms of the `2-Clause BSD license`_, in short: # # Copying and distribution of this file, with or without modification, # are permitted in any medium without royalty provided the copyright # notice and this notice are preserved. # This file is offered as-is, without any warranty. # # .. _2-Clause BSD license: https://opensource.org/licenses/BSD-2-Clause # # .. [1] the original `rst2mathml.py` in `sandbox/jensj/latex_math` """Convert LaTex maths code into presentational MathML. This module is provisional: the API is not settled and may change with any minor Docutils version. """ # Usage: # # >>> from latex2mathml import * import re import unicodedata from docutils.utils.math import tex2unichar, toplevel_code # Character data # -------------- # LaTeX math macro to Unicode mappings. # Character categories. # identifiers -> letters = tex2unichar.mathalpha letters['hbar'] = '\u210F' # compatibility mapping to ℏ (\hslash). 
# (ħ LATIN SMALL LETTER H WITH STROKE is upright) # special case: Capital Greek letters: (upright in TeX style) greek_capitals = { 'Phi': '\u03a6', 'Xi': '\u039e', 'Sigma': '\u03a3', 'Psi': '\u03a8', 'Delta': '\u0394', 'Theta': '\u0398', 'Upsilon': '\u03d2', 'Pi': '\u03a0', 'Omega': '\u03a9', 'Gamma': '\u0393', 'Lambda': '\u039b'} # functions -> functions = { # functions with a space in the name 'liminf': 'lim\u202finf', 'limsup': 'lim\u202fsup', 'injlim': 'inj\u202flim', 'projlim': 'proj\u202flim', # embellished function names (see handle_cmd() below) 'varlimsup': 'lim', 'varliminf': 'lim', 'varprojlim': 'lim', 'varinjlim': 'lim', # custom function name 'operatorname': None, } functions.update((name, name) for name in ('arccos', 'arcsin', 'arctan', 'arg', 'cos', 'cosh', 'cot', 'coth', 'csc', 'deg', 'det', 'dim', 'exp', 'gcd', 'hom', 'ker', 'lg', 'ln', 'log', 'Pr', 'sec', 'sin', 'sinh', 'tan', 'tanh')) # Function with limits: 'lim', 'sup', 'inf', 'max', 'min': # use to allow "movablelimits" attribute (see below). 
# modulo operator/arithmetic
# Each value is unpacked as (binary, named, parentheses, padding).
modulo_functions = {
    # cmdname: (binary, named, parentheses, padding)
    'bmod': (True, True, False, '0.278em'),  # a mod n
    'pmod': (False, True, True, '0.444em'),  # a  (mod n)
    'mod': (False, True, False, '0.667em'),  # a  mod n
    'pod': (False, False, True, '0.444em'),  # a  (n)
    }

# math font selection -> <mi mathvariant=...> or <mstyle mathvariant=...>
math_alphabets = {
    # 'cmdname': 'mathvariant value'  # package
    'boldsymbol': 'bold',
    'mathbf': 'bold',
    'mathit': 'italic',
    'mathtt': 'monospace',
    'mathrm': 'normal',
    'mathsf': 'sans-serif',
    'mathcal': 'script',
    'mathbfit': 'bold-italic',  # isomath
    'mathbb': 'double-struck',  # amssymb
    'mathfrak': 'fraktur',  # amssymb
    'mathsfit': 'sans-serif-italic',  # isomath
    'mathsfbfit': 'sans-serif-bold-italic',  # isomath
    'mathscr': 'script',  # mathrsfs
    # unsupported: bold-fraktur
    #              bold-script
    #              bold-sans-serif
    }

# operator, fence, or separator -> <mo>

stretchables = {
    # extensible delimiters allowed in left/right cmds
    'backslash': '\\',
    'uparrow': '\u2191',  # ↑ UPWARDS ARROW
    'downarrow': '\u2193',  # ↓ DOWNWARDS ARROW
    'updownarrow': '\u2195',  # ↕ UP DOWN ARROW
    'Uparrow': '\u21d1',  # ⇑ UPWARDS DOUBLE ARROW
    'Downarrow': '\u21d3',  # ⇓ DOWNWARDS DOUBLE ARROW
    'Updownarrow': '\u21d5',  # ⇕ UP DOWN DOUBLE ARROW
    'lmoustache': '\u23b0',  # ⎰ … CURLY BRACKET SECTION
    'rmoustache': '\u23b1',  # ⎱ … LEFT CURLY BRACKET SECTION
    'arrowvert': '\u23d0',  # ⏐ VERTICAL LINE EXTENSION
    'bracevert': '\u23aa',  # ⎪ CURLY BRACKET EXTENSION
    'lvert': '|',  # left |
    'lVert': '\u2016',  # left ‖
    'rvert': '|',  # right |
    'rVert': '\u2016',  # right ‖
    'Arrowvert': '\u2016',  # ‖
    }
# Later updates win on duplicate keys, so the order of these calls matters.
stretchables.update(tex2unichar.mathfence)
stretchables.update(tex2unichar.mathopen)  # Braces
stretchables.update(tex2unichar.mathclose)  # Braces

# >>> print(' '.join(sorted(set(stretchables.values()))))
# [ \ ] { | } ‖ ↑ ↓ ↕ ⇑ ⇓ ⇕ ⌈ ⌉ ⌊ ⌋ ⌜ ⌝ ⌞ ⌟ ⎪ ⎰ ⎱ ⏐ ⟅ ⟆ ⟦ ⟧ ⟨ ⟩ ⟮ ⟯ ⦇ ⦈

operators = {
    # negated symbols without pre-composed Unicode character
    # (base character followed by U+0338 COMBINING LONG SOLIDUS OVERLAY)
    'nleqq': '\u2266\u0338',  # ≦̸
    'ngeqq': '\u2267\u0338',  # ≧̸
    'nleqslant': '\u2a7d\u0338',  # ⩽̸
    'ngeqslant': '\u2a7e\u0338',  # ⩾̸
    'ngtrless': '\u2277\u0338',  # txfonts
    'nlessgtr': '\u2276\u0338',  # txfonts
    'nsubseteqq': '\u2AC5\u0338',  # ⫅̸
    'nsupseteqq': '\u2AC6\u0338',  # ⫆̸
    # compatibility definitions:
    'centerdot': '\u2B1D',  # BLACK VERY SMALL SQUARE | mathbin
    'varnothing': '\u2300',  # ⌀ DIAMETER SIGN | empty set
    'varpropto': '\u221d',  # ∝ PROPORTIONAL TO | sans serif
    'triangle': '\u25B3',  # WHITE UP-POINTING TRIANGLE | mathord
    'triangledown': '\u25BD',  # WHITE DOWN-POINTING TRIANGLE | mathord
    # alias commands:
    'dotsb': '\u22ef',  # ⋯ with binary operators/relations
    'dotsc': '\u2026',  # … with commas
    'dotsi': '\u22ef',  # ⋯ with integrals
    'dotsm': '\u22ef',  # ⋯ multiplication dots
    'dotso': '\u2026',  # … other dots
    # functions with movable limits (requires <mo>)
    'lim': 'lim',
    'sup': 'sup',
    'inf': 'inf',
    'max': 'max',
    'min': 'min',
    }
# Entries from the imported tables override the literal entries above on
# duplicate keys; `stretchables` is merged last and therefore wins.
operators.update(tex2unichar.mathbin)  # Binary symbols
operators.update(tex2unichar.mathrel)  # Relation symbols, arrow symbols
operators.update(tex2unichar.mathord)  # Miscellaneous symbols
operators.update(tex2unichar.mathpunct)  # Punctuation
operators.update(tex2unichar.mathop)  # Variable-sized symbols
operators.update(stretchables)

# special cases

thick_operators = {
    # style='font-weight: bold;'
    'thicksim': '\u223C',  # ∼
    'thickapprox': '\u2248',  # ≈
    }

small_operators = {
    # mathsize='75%'
    'shortmid': '\u2223',  # ∣
    'shortparallel': '\u2225',  # ∥
    'nshortmid': '\u2224',  # ∤
    'nshortparallel': '\u2226',  # ∦
    'smallfrown': '\u2322',  # ⌢ FROWN
    'smallsmile': '\u2323',  # ⌣ SMILE
    'smallint': '\u222b',  # ∫ INTEGRAL
    }

# Operators and functions with limits above/below in display formulas
# and in index position inline (movablelimits=True)
movablelimits = ('bigcap', 'bigcup', 'bigodot', 'bigoplus', 'bigotimes',
                 'bigsqcup', 'biguplus', 'bigvee', 'bigwedge',
                 'coprod', 'intop', 'ointop', 'prod', 'sum',
                 'lim', 'max', 'min', 'sup', 'inf')
# Depending on settings, integrals may also be in this category.
# (e.g. if "amsmath" is loaded with option "intlimits", see
# http://mirror.ctan.org/macros/latex/required/amsmath/amsldoc.pdf)
# NOTE: `movablelimits` is a tuple, which has no `extend()` method; the
# disabled statement below would need ``movablelimits += (...)`` instead.
# movablelimits.extend(('fint', 'iiiint', 'iiint', 'iint', 'int', 'oiint',
#                       'oint', 'ointctrclockwise', 'sqint',
#                       'varointclockwise',))

# horizontal space -> <mspace>
# Maps the TeX spacing command to a width (CSS/MathML length in em).
spaces = {'qquad': '2em',  # two \quad
          'quad': '1em',  # 18 mu
          'thickspace': '0.2778em',  # 5mu = 5/18em
          ';': '0.2778em',  # 5mu thickspace
          ' ': '0.25em',  # inter word space
          'medspace': '0.2222em',  # 4mu = 2/9em
          ':': '0.2222em',  # 4mu medspace
          'thinspace': '0.1667em',  # 3mu = 1/6em
          ',': '0.1667em',  # 3mu thinspace
          'negthinspace': '-0.1667em',  # -3mu = -1/6em
          '!': '-0.1667em',  # negthinspace
          'negmedspace': '-0.2222em',  # -4mu = -2/9em
          'negthickspace': '-0.2778em',  # -5mu = -5/18em
          }

# accents -> <mover>
# NOTE(review): the element name was lost from this comment and the code
# consuming `accents` is not in view; <mover> is presumed -- confirm.
accents = {
    # TeX: (spacing, combining)
    'acute': ('´', '\u0301'),
    'bar': ('ˉ', '\u0304'),
    'breve': ('˘', '\u0306'),
    'check': ('ˇ', '\u030C'),
    'dot': ('˙', '\u0307'),
    'ddot': ('¨', '\u0308'),
    'dddot': ('⋯', '\u20DB'),
    'grave': ('`', '\u0300'),
    'hat': ('ˆ', '\u0302'),
    'mathring': ('˚', '\u030A'),
    'tilde': ('˜', '\u0303'),  # tilde ~ or small tilde ˜?
    'vec': ('→', '\u20d7'),  # → too heavy, accents="false"
    # TODO: ddddot
    }

# limits etc. -> <mover> or <munder>
over = {
    # TeX: (char, offset-correction/em)
    'overbrace': ('\u23DE', -0.2),  # DejaVu Math -0.6
    'overleftarrow': ('\u2190', -0.2),
    'overleftrightarrow': ('\u2194', -0.2),
    'overline': ('_', -0.2),  # \u2012 does not stretch
    'overrightarrow': ('\u2192', -0.2),
    'widehat': ('^', -0.5),
    'widetilde': ('~', -0.3),
    }
# Same value layout as `over`: (char, offset-correction/em).
under = {'underbrace': ('\u23DF', 0.1),  # DejaVu Math -0.7
         'underleftarrow': ('\u2190', -0.2),
         'underleftrightarrow': ('\u2194', -0.2),
         'underline': ('_', -0.8),
         'underrightarrow': ('\u2192', -0.2),
         }

# Character translations
# ----------------------
# characters with preferred alternative in mathematical use
# cf.
# https://www.w3.org/TR/MathML3/chapter7.html#chars.anomalous
anomalous_chars = {'-': '\u2212',  # HYPHEN-MINUS -> MINUS SIGN
                   ':': '\u2236',  # COLON -> RATIO
                   '~': '\u00a0',  # NO-BREAK SPACE
                   }

# blackboard bold (Greek characters not working with "mathvariant" (Firefox 78)
mathbb = {'Γ': '\u213E',  # ℾ
          'Π': '\u213F',  # ℿ
          'Σ': '\u2140',  # ⅀
          'γ': '\u213D',  # ℽ
          'π': '\u213C',  # ℼ
          }

# Matrix environments
matrices = {
    # name: fences
    'matrix': ('', ''),
    'smallmatrix': ('', ''),  # smaller, see begin_environment()!
    'pmatrix': ('(', ')'),
    'bmatrix': ('[', ']'),
    'Bmatrix': ('{', '}'),
    'vmatrix': ('|', '|'),
    'Vmatrix': ('\u2016', '\u2016'),  # ‖
    'cases': ('{', ''),
    }

layout_styles = {
    'displaystyle': {'displaystyle': True, 'scriptlevel': 0},
    'textstyle': {'displaystyle': False, 'scriptlevel': 0},
    'scriptstyle': {'displaystyle': False, 'scriptlevel': 1},
    'scriptscriptstyle': {'displaystyle': False, 'scriptlevel': 2},
    }
# See also https://www.w3.org/TR/MathML3/chapter3.html#presm.scriptlevel

# NOTE: the style dicts taken from `layout_styles` are shared objects,
# not copies.
fractions = {
    # name: style_attrs, frac_attrs
    'frac': ({}, {}),
    'cfrac': ({'displaystyle': True, 'scriptlevel': 0,
               'CLASS': 'cfrac'}, {}),  # in LaTeX with padding
    'dfrac': (layout_styles['displaystyle'], {}),
    'tfrac': (layout_styles['textstyle'], {}),
    'binom': ({}, {'linethickness': 0}),
    'dbinom': (layout_styles['displaystyle'], {'linethickness': 0}),
    'tbinom': (layout_styles['textstyle'], {'linethickness': 0}),
    }

delimiter_sizes = ['', '1.2em', '1.623em', '2.047em', '2.470em']

# Values are indices into `delimiter_sizes`.
bigdelimiters = {'left': 0,
                 'right': 0,
                 'bigl': 1,
                 'bigr': 1,
                 'Bigl': 2,
                 'Bigr': 2,
                 'biggl': 3,
                 'biggr': 3,
                 'Biggl': 4,
                 'Biggr': 4,
                 }


# MathML element classes
# ----------------------

class math:
    """Base class for MathML elements and root of MathML trees.

    A node stores its child nodes in the list `children` and its XML
    attributes in the dict `attributes`.  The class name doubles as the
    XML tag name.
    """

    nchildren = None
    """Expected number of children or None"""
    # cf. https://www.w3.org/TR/MathML3/chapter3.html#id.3.1.3.2
    parent = None
    """Parent node in MathML DOM tree."""
    _level = 0  # indentation level (static class variable)
    # Translation table for `str.translate()`, applied to token data:
    # the XML markup characters are replaced by the predefined entities,
    # the invisible FUNCTION APPLICATION character by a numeric character
    # reference (visible in the serialization, no DTD required).
    xml_entities = {  # for invalid and invisible characters
        ord('<'): '&lt;',
        ord('>'): '&gt;',
        ord('&'): '&amp;',
        0x2061: '&#x2061;',
        }
    _boolstrings = {True: 'true', False: 'false'}
    """String representation of boolean MathML attribute values."""

    html_tagname = 'span'
    """Tag name for HTML representation."""

    def __init__(self, *children, **attributes):
        r"""Set up node with `children` and `attributes`.

        Attributes are downcased: Use CLASS to set "class" value.
        >>> math(mn(3), CLASS='test')
        math(mn(3), class='test')
        >>> math(CLASS='test').toprettyxml()
        '<math class="test">\n</math>'

        Raise `SyntaxError` if more children are passed than the
        element accepts.
        """
        # Use .lower() to allow argument `CLASS` for attribute `class`
        # (Python keyword). MathML uses only lowercase attributes.
        #
        # The attributes are stored *before* the children are appended:
        # `append()` formats the node into its error message and
        # `__repr__()` reads `self.attributes`.
        self.attributes = {key.lower(): value
                           for key, value in attributes.items()}
        self.children = []
        self.extend(children)

    def __repr__(self):
        """Return a constructor-like representation of the node."""
        content = [repr(item) for item in getattr(self, 'children', [])]
        if hasattr(self, 'data'):
            content.append(repr(self.data))
        if isinstance(self, MathSchema) and self.switch:
            content.append('switch=True')
        content += ["%s=%r" % (k, v) for k, v in self.attributes.items()
                    if v is not None]
        return self.__class__.__name__ + '(%s)' % ', '.join(content)

    def __len__(self):
        """Return the number of children."""
        return len(self.children)

    # emulate dictionary-like access to attributes
    # see `docutils.nodes.Element` for dict/list interface
    def __getitem__(self, key):
        return self.attributes[key]

    def __setitem__(self, key, item):
        self.attributes[key] = item

    def get(self, *args, **kwargs):
        """Return an attribute value (same signature as `dict.get()`)."""
        return self.attributes.get(*args, **kwargs)

    def full(self):
        """Return boolean indicating whether children may be appended."""
        return (self.nchildren is not None
                and len(self) >= self.nchildren)

    def append(self, child):
        """Append child and return self or first non-full parent.

        If self is full, go up the tree and return first non-full node or
        `None`.

        Raise `SyntaxError` if self is already full before appending.
        """
        if self.full():
            raise SyntaxError('Node %s already full!' % self)
        self.children.append(child)
        child.parent = self
        if self.full():
            return self.close()
        return self

    def extend(self, children):
        """Append all items of `children` and return self."""
        for child in children:
            self.append(child)
        return self

    def close(self):
        """Close element and return first non-full parent or None."""
        parent = self.parent
        while parent is not None and parent.full():
            parent = parent.parent
        return parent

    def toprettyxml(self):
        """Return XML representation of self as string."""
        return ''.join(self._xml())

    def _xml(self, level=0):
        """Return list of XML fragments: start tag, body, end tag."""
        return ([self.xml_starttag()]
                + self._xml_body(level)
                + ['</%s>' % self.__class__.__name__])

    def xml_starttag(self):
        """Return the start tag with all attributes that are not None.

        The strings 'True' and 'False' in attribute values are written
        in lowercase.
        """
        attrs = ('%s="%s"' % (k, str(v).replace('True', 'true')
                              .replace('False', 'false'))
                 for k, v in self.attributes.items()
                 if v is not None)
        return '<%s>' % ' '.join((self.__class__.__name__, *attrs))

    def _xml_body(self, level=0):
        """Return list of XML fragments for the children.

        Every child starts on a new line, indented one step deeper than
        `level`; the end tag of self goes on a new line at `level`.
        """
        xml = []
        for child in self.children:
            xml.extend(['\n', ' ' * (level+1)])
            xml.extend(child._xml(level+1))
        xml.extend(['\n', ' ' * level])
        return xml

    def is_block(self):
        """Return true, if `self` or a parent has ``display='block'``."""
        try:
            return self['display'] == 'block'
        except KeyError:
            try:
                return self.parent.is_block()
            except AttributeError:
                # no parent (`self.parent` is None): top of the tree
                return False


# >>> n2 = math(mn(2))
# >>> n2
# math(mn(2))
# >>> n2.toprettyxml()
# '<math>\n <mn>2</mn>\n</math>'
# >>> len(n2)
# 1
# >>> eq3 = math(id='eq3', display='block')
# >>> eq3
# math(id='eq3', display='block')
# >>> eq3.toprettyxml()
# '<math id="eq3" display="block">\n</math>'
# >>> len(eq3)
# 0
# >>> math(CLASS='bold').xml_starttag()
# '<math class="bold">'
# >>> n2.is_block()
# False
# >>> node = n2.append(mrow())
# >>> node.is_block()
# False
# >>> eq3.is_block()
# True
# >>> node = eq3.append(mrow())
# >>> node.is_block()
# True

class mtable(math): pass

# >>> mt = mtable(displaystyle=True)
# >>> mt
# mtable(displaystyle=True)
# >>> math(mt).toprettyxml()
# '<math>\n <mtable displaystyle="true">\n </mtable>\n</math>'


class mrow(math):
    """Group sub-expressions as a horizontal row."""

    def close(self):
        """Close element and return first non-full parent or None.

        Remove <mrow>, if it is single child and the parent infers an mrow
        or if it has only one child element.
        """
        parent = self.parent
        if isinstance(parent, MathRowSchema) and parent.nchildren == 1:
            # Hand the children over to the parent (which infers an mrow).
            # `nchildren` is evaluated while `self` is still the parent's
            # only child, so the parent counts as full afterwards.
            parent.nchildren = len(parent.children)
            parent.children = self.children
            for child in self.children:
                child.parent = parent
            return parent.close()
        if len(self) == 1:
            # Replace self with its only child in the parent's child list.
            try:
                parent.children[parent.children.index(self)] = self.children[0]
                self.children[0].parent = parent
            except (AttributeError, ValueError):
                # no parent or self is not registered as its child
                return self.children[0]
        return super().close()


# >>> mrow(displaystyle=False)
# mrow(displaystyle=False)

# The elements <msqrt>, <mstyle>, <merror>, <mpadded>, <mphantom>, <menclose>,
# <mtd>, <mscarry>, and <math> treat their contents as a single inferred mrow
# formed from all their children.
class MathRowSchema(math):
    """Base class for elements treating content as a single inferred mrow."""


class mtr(MathRowSchema): pass


class mtd(MathRowSchema): pass


class menclose(MathRowSchema):
    nchildren = 1  # \boxed expects one argument or a group


class mphantom(MathRowSchema):
    nchildren = 1  # \phantom expects one argument or a group


class msqrt(MathRowSchema):
    nchildren = 1  # \sqrt expects one argument or a group


class mstyle(MathRowSchema):
    nchildren = 1  # \mathrm, ... expect one argument or a group


class MathToken(math):
    """Token Element: contains textual data instead of children.

    Base class for mo, mi, and mn.
    """
    nchildren = 0

    def __init__(self, data, **attributes):
        """Store `data` (the element content) and the `attributes`."""
        self.data = data
        super().__init__(**attributes)

    def _xml_body(self, level=0):
        """Return the data as one-item list, escaped for use in XML."""
        return [str(self.data).translate(self.xml_entities)]


class mtext(MathToken): pass
class mi(MathToken): pass
class mo(MathToken): pass
class mn(MathToken): pass

# >>> mo('<')
# mo('<')
# >>> mo('<')._xml()
# ['<mo>', '&lt;', '</mo>']


class MathSchema(math):
    """Base class for schemata expecting 2 or more children.

    The special attribute `switch` indicates that the last two child
    elements are in reversed order and must be switched before XML-export.
    """

    nchildren = 2

    def __init__(self, *children, **kwargs):
        # `switch` must be set before the base class appends the children:
        # `append()` reads it.
        self.switch = kwargs.pop('switch', False)
        math.__init__(self, *children, **kwargs)

    def append(self, child):
        """Append child; swap the last two children once the node is full.

        Return value as in `math.append()`.
        """
        current_node = super().append(child)
        # normalize order if full
        if self.switch and self.full():
            self.children[-1], self.children[-2] = (self.children[-2],
                                                    self.children[-1])
            self.switch = False
        return current_node


class msub(MathSchema): pass
class msup(MathSchema): pass


class msubsup(MathSchema):
    nchildren = 3

# >>> msub(mi('x'), mo('-'))
# msub(mi('x'), mo('-'))
# >>> msubsup(mi('base'), mi('sub'), mi('super'))
# msubsup(mi('base'), mi('sub'), mi('super'))
# >>> msubsup(mi('base'), mi('super'), mi('sub'), switch=True)
# msubsup(mi('base'), mi('sub'), mi('super'))


class munder(msub): pass
class mover(msup): pass

# >>> munder(mi('lim'), mo('-'), accent=False)
# munder(mi('lim'), mo('-'), accent=False)
# >>> mu = munder(mo('-'), accent=False, switch=True)
# >>> mu
# munder(mo('-'), switch=True, accent=False)
# >>> mu.append(mi('lim'))
# >>> mu
# munder(mi('lim'), mo('-'), accent=False)
# >>> mu.append(mi('lim'))
# Traceback (most recent call last):
# SyntaxError: Node munder(mi('lim'), mo('-'), accent=False) already full!
# >>> munder(mo('-'), mi('lim'), accent=False, switch=True).toprettyxml()
# '<munder accent="false">\n <mi>lim</mi>\n <mo>-</mo>\n</munder>'


class munderover(msubsup): pass


class mroot(MathSchema):
    nchildren = 2


class mfrac(math):
    nchildren = 2


class mspace(math):
    nchildren = 0


# LaTeX to MathML translation
# ---------------------------

# auxiliary functions
# ~~~~~~~~~~~~~~~~~~~

def tex_cmdname(string):
    """Return leading TeX command name and remainder of `string`.

    The name is either a run of ASCII letters (spaces after it are
    dropped) or one single character (empty for an empty `string`).

    >>> tex_cmdname('mymacro2') # up to first non-letter
    ('mymacro', '2')
    >>> tex_cmdname('name 2') # strip trailing whitespace
    ('name', '2')
    >>> tex_cmdname('_2') # single non-letter character
    ('_', '2')

    """
    m = re.match(r'([a-zA-Z]+) *(.*)', string)
    if m is None:
        # no leading letter: take the first character (if any)
        m = re.match(r'(.?)(.*)', string)
    return m.group(1), m.group(2)

# Test:
#
# >>> tex_cmdname('name_2') # first non-letter terminates
# ('name', '_2')
# >>> tex_cmdname(' next') # leading whitespace is returned
# (' ', 'next')
# >>> tex_cmdname('1 2') # whitespace after non-letter is kept
# ('1', ' 2')
# >>> tex_cmdname('') # empty string
# ('', '')


def tex_number(string):
    """Return leading number literal and remainder of `string`.

    The literal may contain '.' and ',' but must end with a digit.
    Return an empty literal if `string` does not start with a number.

    >>> tex_number('123.4')
    ('123.4', '')

    """
    m = re.match(r'([0-9.,]*[0-9]+)(.*)', string)
    if m is None:
        return '', string
    return m.group(1), m.group(2)

# Test:
#
# >>> tex_number(' 23.4b') # leading whitespace -> no number
# ('', ' 23.4b')
# >>> tex_number('23,400/2') # comma separator included
# ('23,400', '/2')
# >>> tex_number('23. 4/2') # trailing separator not included
# ('23', '. 4/2')
# >>> tex_number('4, 2') # trailing separator not included
# ('4', ', 2')
# >>> tex_number('1 000.4')
# ('1', ' 000.4')


def tex_token(string):
    """Return first simple TeX token and remainder of `string`.

    A token is a TeX command (backslash + letters; whitespace after it
    is skipped), a one-character command (backslash + any character),
    or a single character.  The token is empty for an empty `string`.

    >>> tex_token('\\command{without argument}')
    ('\\command', '{without argument}')
    >>> tex_token('or first character')
    ('o', 'r first character')

    """
    m = re.match(r"""((?P<cmd>\\[a-zA-Z]+)\s* # TeX command, skip whitespace
                      |(?P<chcmd>\\.)          # one-character TeX command
                      |(?P<ch>.?))             # first character (or empty)
                     (?P<remainder>.*$)        # remaining part of string
                 """, string, re.VERBOSE)
    cmd, chcmd, ch, remainder = m.group('cmd', 'chcmd', 'ch', 'remainder')
    # only one of the alternatives matched, the others are None
    return cmd or chcmd or ch, remainder

# Test:
#
# >>> tex_token('{opening bracket of group}')
# ('{', 'opening bracket of group}')
# >>> tex_token('\\skip whitespace after macro name')
# ('\\skip', 'whitespace after macro name')
# >>> tex_token('. 
# (continuation of the `tex_token()` test started on the preceding line)
# but not after single char')
# ('.', ' but not after single char')
# >>> tex_token('') # empty string.
# ('', '')
# >>> tex_token('\{escaped bracket')
# ('\\{', 'escaped bracket')


def tex_group(string):
    """Return first TeX group or token and remainder of `string`.

    If `string` starts with an opening bracket, return the content of
    the group (without the enclosing brackets), else the first character.
    Backslash-escaped brackets do not count for nesting.

    Raise `SyntaxError` if the group is not closed.

    >>> tex_group('{first group} returned without brackets')
    ('first group', ' returned without brackets')

    """
    if not string.startswith('{'):
        # special case: there is no group, return first token and remainder
        return string[:1], string[1:]
    nest_level = 0  # level of {{nested} groups}
    escape = False  # the next character is escaped (\)
    for index, c in enumerate(string):
        if escape:
            escape = False
        elif c == '\\':
            escape = True
        elif c == '{':
            nest_level += 1
        elif c == '}':
            nest_level -= 1
            if nest_level == 0:
                # `index` is the position of the matching closing bracket
                return string[1:index], string[index+1:]
    raise SyntaxError('Group without closing bracket')

# >>> tex_group('{} empty group')
# ('', ' empty group')
# >>> tex_group('{group with {nested} group} ')
# ('group with {nested} group', ' ')
# >>> tex_group('{group with {nested group}} at the end')
# ('group with {nested group}', ' at the end')
# >>> tex_group('{{group} {with {{complex }nesting}} constructs}')
# ('{group} {with {{complex }nesting}} constructs', '')
# >>> tex_group('{group with \\{escaped\\} brackets}')
# ('group with \\{escaped\\} brackets', '')
# >>> tex_group('{group followed by closing bracket}} from outer group')
# ('group followed by closing bracket', '} from outer group')
# >>> tex_group('No group? Return first character.')
# ('N', 'o group? Return first character.')
# >>> tex_group(' {also whitespace}')
# (' ', '{also whitespace}')


def tex_token_or_group(string):
    """Return first TeX group or token and remainder of `string`.

    Like `tex_token()`, but if the first token is an opening bracket,
    return the content of the group (see `tex_group()`) instead.

    >>> tex_token_or_group('\\command{without argument}')
    ('\\command', '{without argument}')
    >>> tex_token_or_group('first character')
    ('f', 'irst character')
    >>> tex_token_or_group(' also whitespace')
    (' ', 'also whitespace')
    >>> tex_token_or_group('{first group} keep rest')
    ('first group', ' keep rest')

    """
    arg, remainder = tex_token(string)
    if arg == '{':
        arg, remainder = tex_group(string.lstrip())
    return arg, remainder

# >>> tex_token_or_group('\{no group but left bracket')
# ('\\{', 'no group but left bracket')


def tex_optarg(string):
    """Return optional argument and remainder.

    The optional argument is returned without the enclosing square
    brackets; whitespace before the opening bracket is skipped.
    Return an empty argument and the unchanged `string` if there is no
    optional argument.

    Raise `SyntaxError` if `string` starts with an opening square bracket
    but no (un-nested) optional argument can be extracted.

    >>> tex_optarg('[optional argument] returned without brackets')
    ('optional argument', ' returned without brackets')
    >>> tex_optarg('{empty string, if there is no optional arg}')
    ('', '{empty string, if there is no optional arg}')

    """
    m = re.match(r"""\s*                            # leading whitespace
                 \[(?P<optarg>(\\]|[^\[\]]|\\])*)\] # [group] without nested groups
                 (?P<remainder>.*$)
                 """, string, re.VERBOSE)
    if m is not None:
        return m.group('optarg'), m.group('remainder')
    if string.startswith('['):
        raise SyntaxError('Could not extract optional argument from %r'
                          % string)
    return '', string

# Test:
# >>> tex_optarg(' [optional argument] after whitespace')
# ('optional argument', ' after whitespace')
# >>> tex_optarg('[missing right bracket')
# Traceback (most recent call last):
# SyntaxError: Could not extract optional argument from '[missing right bracket'
# >>> tex_optarg('[group with [nested group]]')
# Traceback (most recent call last):
# SyntaxError: Could not extract optional argument from '[group with [nested group]]'


def parse_latex_math(node, string):
    r"""Append MathML conversion of `string` to `node` and return it.

    `node` is the MathML element that receives the converted content,
    `string` the LaTeX math code.  The return value is the node passed
    in as `node`, not the last node worked on.

    Raise `SyntaxError` for characters that are not supported.

    >>> parse_latex_math(math(), r'\alpha')
    math(mi('α'))
    >>> parse_latex_math(mrow(), r'x_{n}')
    mrow(msub(mi('x'), mi('n')))

    """
    # Normalize white-space:
    string = ' '.join(string.split())
    tree = node  # remember the start node, `node` moves through the tree

    while string:
        # Take off first character:
        c, string = string[0], string[1:]

        if c == ' ':
            continue  # whitespace is ignored in LaTeX math mode
        if c == '\\':  # start of a LaTeX macro
            cmdname, string = tex_cmdname(string)
            node, string = handle_cmd(cmdname, node, string)
        elif c in "_^":
            node = handle_script_or_limit(node, c)
        elif c == '{':
            # start of a group: collect the content in a new <mrow>
            new_node = mrow()
            node.append(new_node)
            node = new_node
        elif c == '}':
            node = node.close()
        elif c == '&':
            # cell separator: close the current cell and open a new <mtd>
            new_node = mtd()
            node.close().append(new_node)
            node = new_node
        elif c.isalpha():
            node = node.append(mi(c))
        elif c.isdigit():
            number, string = tex_number(string)
            node = node.append(mn(c+number))
        elif c in anomalous_chars:
            # characters with a special meaning in LaTeX math mode
            # fix spacing before "unary" minus.
            attributes = {}
            if c == '-' and node.children:
                previous_node = node.children[-1]
                if (getattr(previous_node, 'data', '-') in '([='
                        or previous_node.get('class') == 'mathopen'):
                    attributes['form'] = 'prefix'
            node = node.append(mo(anomalous_chars[c], **attributes))
        elif c in "/()[]|":
            node = node.append(mo(c, stretchy=False))
        elif c in "+*=<>,.!?`';@":
            node = node.append(mo(c))
        else:
            raise SyntaxError('Unsupported character: "%s"' % c)
    return tree

# Test:

# >>> print(parse_latex_math(math(), ''))
# math()
# >>> parse_latex_math(math(), ' \\sqrt{ \\alpha}')
# math(msqrt(mi('α')))
# >>> parse_latex_math(math(), '23.4x')
# math(mn('23.4'), mi('x'))
# >>> parse_latex_math(math(), '\\sqrt 2 \\ne 3')
# math(msqrt(mn('2')), mo('≠'), mn('3'))
# >>> parse_latex_math(math(), '\\sqrt{2 + 3} < 3')
# math(msqrt(mn('2'), mo('+'), mn('3')), mo('<'), mn('3'))
# >>> parse_latex_math(math(), '\\sqrt[3]{2 + 3}')
# math(mroot(mrow(mn('2'), mo('+'), mn('3')), mn('3')))
# >>> parse_latex_math(math(), '\max_x') # function takes limits
#
math(munder(mo('max', movablelimits=True), mi('x'))) # >>> parse_latex_math(math(), 'x^j_i') # ensure correct order: base, sub, sup # math(msubsup(mi('x'), mi('i'), mi('j'))) # >>> parse_latex_math(math(), '\int^j_i') # ensure correct order # math(msubsup(mo('∫'), mi('i'), mi('j'))) # >>> parse_latex_math(math(), 'x_{\\alpha}') # math(msub(mi('x'), mi('α'))) # >>> parse_latex_math(math(), 'x_\\text{in}') # math(msub(mi('x'), mtext('in'))) def handle_cmd(name, node, string): # noqa: C901 TODO make this less complex """Process LaTeX command `name` followed by `string`. Append result to `node`. If needed, parse `string` for command argument. Return new current node and remainder of `string`: >>> handle_cmd('hbar', math(), r' \frac') (math(mi('ℏ')), ' \\frac') >>> handle_cmd('hspace', math(), r'{1ex} (x)') (math(mspace(width='1ex')), ' (x)') """ # Token elements # ============== # identifier -> if name in letters: new_node = mi(letters[name]) if name in greek_capitals: # upright in "TeX style" but MathML sets them italic ("ISO style"). # CSS styling does not change the font style in Firefox 78. # Use 'mathvariant="normal"'? new_node['class'] = 'capital-greek' node = node.append(new_node) return node, string if name in functions: # use followed by invisible function applicator character # (see https://www.w3.org/TR/MathML3/chapter3.html#presm.mi) if name == 'operatorname': # custom function name, e.g. 
``\operatorname{abs}(x)`` # TODO: \operatorname* -> with limits arg, string = tex_token_or_group(string) new_node = mi(arg, mathvariant='normal') else: new_node = mi(functions[name]) # embellished function names: if name == 'varliminf': # \underline\lim new_node = munder(new_node, mo('_')) elif name == 'varlimsup': # \overline\lim new_node = mover(new_node, mo('¯'), accent=False) elif name == 'varprojlim': # \underleftarrow\lim new_node = munder(new_node, mo('\u2190')) elif name == 'varinjlim': # \underrightarrow\lim new_node = munder(new_node, mo('\u2192')) node = node.append(new_node) # add ApplyFunction when appropriate (not \sin^2(x), say) # cf. https://www.w3.org/TR/MathML3/chapter3.html#presm.mi if string and string[0] not in ('^', '_'): node = node.append(mo('\u2061')) # ⁡ return node, string if name in modulo_functions: (binary, named, parentheses, padding) = modulo_functions[name] if binary: node = node.append(mo('mod', lspace=padding, rspace=padding)) return node, string # left padding if node.is_block(): padding = '1em' node = node.append(mspace(width=padding)) if parentheses: node = node.append(mo('(', stretchy=False)) if named: node = node.append(mi('mod')) node = node.append(mspace(width='0.333em')) arg, string = tex_token_or_group(string) node = parse_latex_math(node, arg) if parentheses: node = node.append(mo(')', stretchy=False)) return node, string if name in math_alphabets: if name == 'boldsymbol': attributes = {'class': 'boldsymbol'} else: attributes = {'mathvariant': math_alphabets[name]} if name == 'mathscr': attributes['class'] = 'mathscr' # Check for single symbol (letter, name, or ⅀) arg, remainder = tex_token_or_group(string) if arg.startswith('\\'): # convert single letters (so the isalpha() test below works). # TODO: convert all LICRs in a group (\matrm{\mu\Omega}) arg = letters.get(arg[1:], arg) if name == 'mathbb': # mathvariant="double-struck" is ignored for Greek letters # (tested in Firefox 78). Use literal Unicode characters. 
arg = mathbb.get(arg, arg) if arg.isalpha() or arg == '\u2140': node = node.append(mi(arg, **attributes)) return node, remainder # Wrap in