acl-org · mbollmann · Mar 5, 2025 · Mar 6, 2025 · Mar 6, 2025
diff --git a/python/acl_anthology/text/markuptext.py b/python/acl_anthology/text/markuptext.py
@@ -26,6 +26,7 @@
 from ..utils import (
     latex_encode,
     latex_convert_quotes,
+    parse_latex_to_xml,
     remove_extra_whitespace,
     stringify_children,
 )
@@ -181,6 +182,18 @@ def from_string(cls, text: str) -> MarkupText:
         """
         return cls(text)
 
+    @classmethod
+    def from_latex(cls, text: str) -> MarkupText:
+        """Instantiate a MarkupText from a string that may contain LaTeX markup.
+
+        The string is converted to an XML element with `parse_latex_to_xml`, and that element is then passed on to `from_xml`.
+
+        Arguments:
+            text: A text string potentially containing LaTeX markup.
+
+        Returns:
+            Instantiated MarkupText object corresponding to the string.
+        """
+        return cls.from_xml(parse_latex_to_xml(text))
+
     @classmethod
     def from_xml(cls, element: etree._Element) -> MarkupText:
         """

diff --git a/python/acl_anthology/utils/__init__.py b/python/acl_anthology/utils/__init__.py
@@ -15,7 +15,12 @@
 from .citation import citeproc_render_html
 from .git import clone_or_pull_from_repo
 from .ids import build_id, parse_id, AnthologyID
-from .latex import latex_encode, latex_convert_quotes, make_bibtex_entry
+from .latex import (
+    latex_encode,
+    latex_convert_quotes,
+    make_bibtex_entry,
+    parse_latex_to_xml,
+)
 from .logging import setup_rich_logging, get_logger
 from .text import remove_extra_whitespace
 from .xml import stringify_children
@@ -31,6 +36,7 @@
     "latex_convert_quotes",
     "make_bibtex_entry",
     "parse_id",
+    "parse_latex_to_xml",
     "remove_extra_whitespace",
     "setup_rich_logging",
     "stringify_children",

diff --git a/python/acl_anthology/utils/latex.py b/python/acl_anthology/utils/latex.py
@@ -12,12 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Functions implementing the conversion to LaTeX/BibTeX formats."""
+"""Functions implementing conversions to and from LaTeX/BibTeX formats."""
 
 from __future__ import annotations
 
 import re
 from functools import lru_cache
+from lxml import etree
 from typing import cast, Optional, TypeAlias, TYPE_CHECKING
 
 if TYPE_CHECKING:
@@ -27,12 +28,34 @@
     SerializableAsBibTeX: TypeAlias = None | str | MarkupText | list[NameSpecification]
     """Any type that can be supplied to `make_bibtex_entry`."""
 
+from .logging import get_logger
+from .xml import append_text
 
 from pylatexenc.latexencode import (
     UnicodeToLatexEncoder,
     UnicodeToLatexConversionRule,
     RULE_DICT,
 )
+from pylatexenc.latexwalker import (
+    LatexWalker,
+    LatexNode,
+    LatexCharsNode,
+    LatexGroupNode,
+    LatexMacroNode,
+    LatexMathNode,
+    LatexSpecialsNode,
+)
+from pylatexenc.latex2text import (
+    LatexNodes2Text,
+    MacroTextSpec,
+    get_default_latex_context_db,
+)
+
+log = get_logger()
+
+################################################################################
+### UNICODE TO LATEX (BIBTEX)
+################################################################################
 
 LATEXENC = UnicodeToLatexEncoder(
     conversion_rules=[
@@ -54,6 +77,7 @@
     unknown_char_policy="keep",
     unknown_char_warning=False,
 )
+"""A UnicodeToLatexEncoder instance intended for BibTeX generation."""
 
 BIBTEX_FIELD_NEEDS_ENCODING = {"journal", "address", "publisher", "note"}
 """Any BibTeX field whose value should be LaTeX-encoded first."""
@@ -211,3 +235,167 @@
         A BibTeX-formatted string representing the given names.
     """
     return "  and\n      ".join(spec.name.as_bibtex() for spec in namespecs)
+
+
+################################################################################
+### LATEX TO UNICODE/XML
+################################################################################
+
+# Maps LaTeX macro names to the name of the XML tag that is created for them
+# when parsing LaTeX into XML (e.g. \textbf -> <b>).
+LATEX_MACRO_TO_XMLTAG = {
+    "emph": "i",
+    "em": "i",
+    "textit": "i",
+    "it": "i",
+    "textsl": "i",
+    "sl": "i",
+    "textbf": "b",
+    "bf": "b",
+    "url": "url",
+}
+# Names of LaTeX macros that are treated as citation commands.
+LATEX_CITE_MACROS = {"cite", "citep", "citet", "newcite", "citeauthor", "citeyear"}
+# LaTeX-to-text context: pylatexenc's default context database, extended
+# in-place with a "citations" category that gives every citation macro the
+# fixed replacement text "(CITATION)".
+# NOTE(review): prepend=True presumably makes these specs take precedence over
+# the default ones -- confirm against the pylatexenc documentation.
+L2T_CONTEXT = get_default_latex_context_db()
+L2T_CONTEXT.add_context_category(
+    "citations",
+    prepend=True,
+    macros=[
+        MacroTextSpec(macro, simplify_repl=r"(CITATION)") for macro in LATEX_CITE_MACROS
+    ],
+)
+# Shared converter for turning individual LaTeX nodes into plain text; it must
+# be created after L2T_CONTEXT has been extended, since it receives that
+# context as its `latex_context`.
+LATEX_TO_TEXT = LatexNodes2Text(strict_latex_spaces=True, latex_context=L2T_CONTEXT)
+
+
+def _is_trivial_math(node: LatexMathNode) -> bool:
+    """Helper function to determine whether or not a LatexMathNode contains only 'trivial' content that doesn't require a <tex-math> node.
+
+    Content counts as trivial if, after removing the math delimiters and unescaping percent signs, it consists only of whitespace, digits, and the characters `.,@%~`.
+
+    Arguments:
+        node: The math node to inspect.
+
+    Returns:
+        True if the math content is trivial (including when it is empty), False otherwise.
+    """
+    # Strip the delimiters that this node was actually written with, so that
+    # \(...\) and \[...\] are handled the same way as $...$ and $$...$$
+    opener, closer = node.delimiters
+    content = node.latex_verbatim().removeprefix(opener).removesuffix(closer)
+    content = content.replace(r"\%", "%")
+    return all(c.isspace() or c.isdigit() or c in ".,@%~" for c in content)
+
+
+def _should_parse_macro_as_text(node: LatexMacroNode) -> bool:
+    """Helper function to determine whether or not a LatexMacroNode should be parsed as a simple character macro.
+
+    This is the case if the macro has no arguments, or exactly one argument that is a single character, optionally wrapped in a group (e.g. `x` or `{x}`).
+
+    Arguments:
+        node: The macro node to inspect.
+
+    Returns:
+        True if the macro should be converted to text as a whole, False otherwise.
+    """
+    subnodes = node.nodeargd.argnlist
+    if len(subnodes) == 0:
+        # Macro without arguments; e.g. \i or \l
+        return True
+    elif len(subnodes) > 1:
+        # Macro with more than one argument
+        return False
+    subnode = subnodes[0]
+    if subnode is None:
+        # Argument lists can contain None entries (the node-list parser skips
+        # them, too); a macro whose only argument slot is empty effectively
+        # has no arguments, so treat it like the argument-less case above
+        # instead of failing on `None.isNodeType`
+        return True
+    if subnode.isNodeType(LatexCharsNode) and subnode.len == 1:
+        # Bare single-character argument
+        return True
+    if (
+        subnode.isNodeType(LatexGroupNode)
+        and len(subnode.nodelist) == 1
+        and subnode.nodelist[0].isNodeType(LatexCharsNode)
+        and subnode.nodelist[0].len == 1
+    ):
+        # Group containing nothing but a single character
+        return True
+    return False
+
+
+def _should_wrap_in_fixed_case(node: LatexGroupNode) -> bool:
+    """Decide whether a LatexGroupNode should be rendered as a <fixed-case> tag.
+
+    Only non-empty groups delimited by curly braces qualify, and they are still rejected if their source text starts with an opening brace followed by a backslash, or if their first child is a math node or a specials node.
+    """
+    if not node.nodelist or node.delimiters != ("{", "}"):
+        return False
+    first_child = node.nodelist[0]
+    is_excluded = (
+        # {\...} does *not* protect case
+        node.latex_verbatim().startswith("{\\")
+        # Don't mark {$...$}
+        or first_child.isNodeType(LatexMathNode)
+        # Don't mark {``}, {--}, etc.
+        or first_child.isNodeType(LatexSpecialsNode)
+    )
+    return not is_excluded
+
+
+def _parse_nodelist_to_element(
+    nodelist: list[LatexNode],
+    element: etree._Element,
+    use_fixed_case: bool,
+    in_macro: bool = False,
+) -> None:
+    """Parse a list of LaTeX nodes into an XML element using the Anthology markup format.
+
+    Plain text is appended to the element; macros listed in `LATEX_MACRO_TO_XMLTAG` become child elements; citation macros and simple character macros are converted to text; other macros are dropped with a warning, but their arguments are still parsed.  Curly-brace groups are either wrapped in <fixed-case> or flattened, other groups are skipped, and math is either appended as text (if trivial) or kept verbatim in <tex-math>.
+
+    Arguments:
+        nodelist: The list of parsed LaTeX nodes.
+        element: An XML element into which the parsed nodes will be added.
+        use_fixed_case: Flag indicating whether <fixed-case> protection should be applied.  It is switched off for everything inside a <fixed-case> element, so these elements are never nested.
+        in_macro: Flag indicating whether this function was called by recursing into a macro node. (Do not set this manually.)
+
+    Returns:
+        None; the XML element is modified in-place.
+    """
+    for node in nodelist:
+        if node is None:
+            # Node lists taken from macro arguments can contain None entries
+            continue
+        elif node.isNodeType(LatexCharsNode):
+            # Plain text
+            append_text(element, node.chars)
+        elif node.isNodeType(LatexMacroNode):
+            # LaTeX macro
+            if (tag := LATEX_MACRO_TO_XMLTAG.get(node.macroname)) is not None:
+                # This macro should get its own XML tag (e.g. \textbf -> <b>)
+                subelem = etree.SubElement(element, tag)
+                subnodes = node.nodeargd.argnlist
+                _parse_nodelist_to_element(
+                    subnodes, subelem, use_fixed_case, in_macro=True
+                )
+            elif node.macroname in LATEX_CITE_MACROS:
+                # A citation command such as \cite{...}
+                append_text(element, LATEX_TO_TEXT.macro_node_to_text(node))
+            elif _should_parse_macro_as_text(node):
+                # This macro should be parsed as text because it probably
+                # represents a special character, such as \v{c} or \"I
+                append_text(element, LATEX_TO_TEXT.macro_node_to_text(node))
+            else:
+                # This is a macro we don't know how to handle - emit warning,
+                # then discard macro but recurse into its children
+                log.warning(f"Unhandled LaTeX macro '{node.macroname}'")
+                subnodes = node.nodeargd.argnlist
+                _parse_nodelist_to_element(
+                    subnodes, element, use_fixed_case, in_macro=True
+                )
+        elif node.isNodeType(LatexGroupNode):
+            # Bracketed group, such as {...} or [...]
+            if (
+                use_fixed_case
+                and not in_macro
+                and _should_wrap_in_fixed_case(node)
+            ):
+                # Protect this with <fixed-case>, then recurse; protection is
+                # turned off for the contents so that <fixed-case> never nests.
+                # Groups that are direct arguments of a macro (in_macro) are
+                # never wrapped, as their braces only delimit the argument.
+                subelem = etree.SubElement(element, "fixed-case")
+                _parse_nodelist_to_element(node.nodelist, subelem, False)
+            elif node.delimiters == ("{", "}"):
+                # Just recurse
+                _parse_nodelist_to_element(node.nodelist, element, use_fixed_case)
+            else:
+                # Skip [...] or <...> groups
+                pass
+        elif node.isNodeType(LatexMathNode):
+            # Math node
+            if _is_trivial_math(node):
+                # Just append as text
+                append_text(element, LATEX_TO_TEXT.math_node_to_text(node))
+            else:
+                # Keep verbatim, but wrap in <tex-math>; remove the delimiters
+                # the node was actually written with, so that \(...\) and
+                # \[...\] are unwrapped just like $...$ and $$...$$
+                opener, closer = node.delimiters
+                subelem = etree.SubElement(element, "tex-math")
+                subelem.text = (
+                    node.latex_verbatim().removeprefix(opener).removesuffix(closer)
+                )
+        elif node.isNodeType(LatexSpecialsNode):
+            # TODO: Is this always the correct way?
+            append_text(element, LATEX_TO_TEXT.specials_node_to_text(node))
+        else:
+            # Comments or environments
+            log.warning(f"Unhandled node type: {node.nodeType}")
+
+
+def parse_latex_to_xml(latex_input: str, use_fixed_case: bool = True) -> etree._Element:
+    """Turn a string with LaTeX markup into an element in the Anthology XML format.
+
+    The input is parsed into LaTeX nodes, which are then added to a newly created <root> element.
+
+    Arguments:
+        latex_input: A string potentially including LaTeX markup.
+        use_fixed_case: Flag indicating whether <fixed-case> protection should be applied.
+
+    Returns:
+        An XML element representing the given LaTeX input in the Anthology XML format for markup strings.
+    """
+    root = etree.Element("root")
+    # get_latex_nodes() returns several values; only the first one, the list
+    # of parsed nodes, is needed here
+    parsed_nodes = LatexWalker(latex_input).get_latex_nodes()[0]
+    _parse_nodelist_to_element(parsed_nodes, root, use_fixed_case)
+    return root
diff --git a/python/acl_anthology/utils/xml.py b/python/acl_anthology/utils/xml.py
@@ -81,6 +81,30 @@ def assert_equals(elem: etree._Element, other: etree._Element) -> None:
             assert_equals(elem_child, other_child)
 
 
+def append_text(elem: etree._Element, text: str) -> None:
+    """Append text at the very end of an XML element's content.
+
+    For an element without children, the text is added to its text attribute; otherwise, it is added to the tail of its last child.  In both cases, existing text is kept and the new text is concatenated to it.
+
+    Arguments:
+        elem: The XML element.
+        text: The text string to append to the XML element.
+
+    Returns:
+        None; the XML element is modified in-place.
+    """
+    if len(elem) > 0:
+        # Text following child elements is stored in the tail of the last child
+        last_child = elem[-1]
+        last_child.tail = text if last_child.tail is None else last_child.tail + text
+    else:
+        elem.text = text if elem.text is None else elem.text + text
+
+
 def clean_whitespace(
     text: Optional[str], func: Optional[Callable[[str], str]] = None
 ) -> Optional[str]: