Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement conversion from LaTeX to our Markup XML #4787

Draft
wants to merge 3 commits into
base: python-dev
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions python/acl_anthology/text/markuptext.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from ..utils import (
latex_encode,
latex_convert_quotes,
parse_latex_to_xml,
remove_extra_whitespace,
stringify_children,
)
Expand Down Expand Up @@ -181,6 +182,18 @@ def from_string(cls, text: str) -> MarkupText:
"""
return cls(text)

@classmethod
def from_latex(cls, text: str) -> MarkupText:
    """Instantiate a MarkupText object from a string with LaTeX markup.

    The string is first converted into an XML element via
    `parse_latex_to_xml`, which is then passed on to `from_xml`.

    Arguments:
        text: A text string potentially containing LaTeX markup.

    Returns:
        Instantiated MarkupText object corresponding to the string.
    """
    return cls.from_xml(parse_latex_to_xml(text))

@classmethod
def from_xml(cls, element: etree._Element) -> MarkupText:
"""
Expand Down
8 changes: 7 additions & 1 deletion python/acl_anthology/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,12 @@
from .citation import citeproc_render_html
from .git import clone_or_pull_from_repo
from .ids import build_id, parse_id, AnthologyID
from .latex import latex_encode, latex_convert_quotes, make_bibtex_entry
from .latex import (
latex_encode,
latex_convert_quotes,
make_bibtex_entry,
parse_latex_to_xml,
)
from .logging import setup_rich_logging, get_logger
from .text import remove_extra_whitespace
from .xml import stringify_children
Expand All @@ -31,6 +36,7 @@
"latex_convert_quotes",
"make_bibtex_entry",
"parse_id",
"parse_latex_to_xml",
"remove_extra_whitespace",
"setup_rich_logging",
"stringify_children",
Expand Down
190 changes: 189 additions & 1 deletion python/acl_anthology/utils/latex.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.

"""Functions implementing the conversion to LaTeX/BibTeX formats."""
"""Functions implementing conversions to and from LaTeX/BibTeX formats."""

from __future__ import annotations

import re
from functools import lru_cache
from lxml import etree
from typing import cast, Optional, TypeAlias, TYPE_CHECKING

if TYPE_CHECKING:
Expand All @@ -27,12 +28,34 @@
SerializableAsBibTeX: TypeAlias = None | str | MarkupText | list[NameSpecification]
"""Any type that can be supplied to `make_bibtex_entry`."""

from .logging import get_logger
from .xml import append_text

from pylatexenc.latexencode import (
UnicodeToLatexEncoder,
UnicodeToLatexConversionRule,
RULE_DICT,
)
from pylatexenc.latexwalker import (
LatexWalker,
LatexNode,
LatexCharsNode,
LatexGroupNode,
LatexMacroNode,
LatexMathNode,
LatexSpecialsNode,
)
from pylatexenc.latex2text import (
LatexNodes2Text,
MacroTextSpec,
get_default_latex_context_db,
)

log = get_logger()

################################################################################
### UNICODE TO LATEX (BIBTEX)
################################################################################

LATEXENC = UnicodeToLatexEncoder(
conversion_rules=[
Expand All @@ -54,6 +77,7 @@
unknown_char_policy="keep",
unknown_char_warning=False,
)
"""A UnicodeToLatexEncoder instance intended for BibTeX generation."""

BIBTEX_FIELD_NEEDS_ENCODING = {"journal", "address", "publisher", "note"}
"""Any BibTeX field whose value should be LaTeX-encoded first."""
Expand Down Expand Up @@ -211,3 +235,167 @@
A BibTeX-formatted string representing the given names.
"""
return " and\n ".join(spec.name.as_bibtex() for spec in namespecs)


################################################################################
### LATEX TO UNICODE/XML
################################################################################

# Maps LaTeX macro names to the XML tag that should be created for them when
# converting LaTeX into markup XML (e.g. \textbf -> <b>, \emph -> <i>).
LATEX_MACRO_TO_XMLTAG = {
"emph": "i",
"em": "i",
"textit": "i",
"it": "i",
"textsl": "i",
"sl": "i",
"textbf": "b",
"bf": "b",
"url": "url",
}
# Names of LaTeX citation commands; these are not resolved, but rendered as a
# fixed placeholder text (see the "citations" context category below).
LATEX_CITE_MACROS = {"cite", "citep", "citet", "newcite", "citeauthor", "citeyear"}
# pylatexenc's default latex2text context database, extended with a
# "citations" category that replaces every citation macro by "(CITATION)".
# The category is added with prepend=True (presumably so that it takes
# precedence over the default definitions -- confirm in the pylatexenc docs).
L2T_CONTEXT = get_default_latex_context_db()
L2T_CONTEXT.add_context_category(
"citations",
prepend=True,
macros=[
MacroTextSpec(macro, simplify_repl=r"(CITATION)") for macro in LATEX_CITE_MACROS
],
)
# Shared LaTeX-to-text converter, built on the context defined above; used to
# turn individual macro, math, and specials nodes into plain text.
LATEX_TO_TEXT = LatexNodes2Text(strict_latex_spaces=True, latex_context=L2T_CONTEXT)


def _is_trivial_math(node: LatexMathNode) -> bool:
    """Determine whether a math node contains only 'trivial' content.

    Trivial content consists solely of whitespace, digits, and the
    characters ``.,@%~`` (with ``\\%`` counting as ``%``); such a node does
    not require a <tex-math> element and can be emitted as plain text.

    Arguments:
        node: The math node to inspect.

    Returns:
        True if the node's content, without its math delimiters, is trivial.
    """
    content = node.latex_verbatim()
    # Remove exactly the delimiters that enclose this node.  Stripping "$"
    # characters would miss \(...\) and \[...\] delimiters, which made such
    # nodes always look non-trivial.
    opening, closing = node.delimiters
    if opening and content.startswith(opening):
        content = content[len(opening):]
    if closing and content.endswith(closing):
        content = content[: len(content) - len(closing)]
    content = content.replace(r"\%", "%")
    return all(c.isspace() or c.isdigit() or c in ".,@%~" for c in content)


def _should_parse_macro_as_text(node: LatexMacroNode) -> bool:
    """Determine whether a macro node should be parsed as a simple character macro.

    This is the case for macros without arguments (e.g. ``\\i`` or ``\\l``)
    and for macros whose single argument is exactly one character, either
    bare or wrapped in a group (e.g. ``\\"I`` or ``\\v{c}``).

    Arguments:
        node: The macro node to inspect.

    Returns:
        True if the macro should be converted to text as a whole.
    """
    subnodes = node.nodeargd.argnlist
    if len(subnodes) == 0:
        # Macro without arguments; e.g. \i or \l
        return True
    if len(subnodes) > 1:
        # Macro with more than one argument
        return False

    subnode = subnodes[0]
    if subnode is None:
        # The only argument is an optional one that was not supplied, so the
        # macro was effectively used without arguments.  Without this check,
        # the isNodeType() calls below would fail on None.
        return True
    if subnode.isNodeType(LatexCharsNode):
        # Single bare character, as in \"I
        return bool(subnode.len == 1)
    if subnode.isNodeType(LatexGroupNode):
        # Group wrapping a single character, as in \v{c}
        inner = subnode.nodelist
        return bool(
            len(inner) == 1
            and inner[0].isNodeType(LatexCharsNode)
            and inner[0].len == 1
        )
    return False


def _should_wrap_in_fixed_case(node: LatexGroupNode) -> bool:
    """Decide whether a group node should be turned into a <fixed-case> tag.

    Only non-empty ``{...}`` groups qualify, and only if they neither start
    with a backslash nor have a math or specials node as their first child.

    Arguments:
        node: The group node to inspect.

    Returns:
        True if the group should produce a <fixed-case> element.
    """
    children = node.nodelist
    if len(children) == 0 or node.delimiters != ("{", "}"):
        return False

    first_child = children[0]
    exempt = (
        # {\...} does *not* protect case
        node.latex_verbatim().startswith("{\\")
        # Don't mark {$...$}
        or first_child.isNodeType(LatexMathNode)
        # Don't mark {``}, {--}, etc.
        or first_child.isNodeType(LatexSpecialsNode)
    )
    return not exempt


def _parse_nodelist_to_element(
    nodelist: list[LatexNode],
    element: etree._Element,
    use_fixed_case: bool,
    in_macro: bool = False,
) -> None:
    """Parse a list of LaTeX nodes into an XML element using the Anthology markup format.

    Arguments:
        nodelist: The list of parsed LaTeX nodes.
        element: An XML element into which the parsed nodes will be added.
        use_fixed_case: Flag indicating whether <fixed-case> protection should be applied.
        in_macro: Flag indicating whether this function was called by recursing into a macro node. (Do not set this manually.)

    Returns:
        None; the XML element is modified in-place.
    """
    for node in nodelist:
        if node is None:
            # Argument lists may contain None entries; nothing to emit
            continue
        elif node.isNodeType(LatexCharsNode):
            # Plain text
            append_text(element, node.chars)
        elif node.isNodeType(LatexMacroNode):
            # LaTeX macro
            if (tag := LATEX_MACRO_TO_XMLTAG.get(node.macroname)) is not None:
                # This macro should get its own XML tag (e.g. \textbf -> <b>)
                subelem = etree.SubElement(element, tag)
                subnodes = node.nodeargd.argnlist
                _parse_nodelist_to_element(
                    subnodes, subelem, use_fixed_case, in_macro=True
                )
            elif node.macroname in LATEX_CITE_MACROS:
                # A citation command such as \cite{...}
                append_text(element, LATEX_TO_TEXT.macro_node_to_text(node))
            elif _should_parse_macro_as_text(node):
                # This macro should be parsed as text because it probably
                # represents a special character, such as \v{c} or \"I
                append_text(element, LATEX_TO_TEXT.macro_node_to_text(node))
            else:
                # This is a macro we don't know how to handle - emit warning,
                # then discard macro but recurse into its children
                log.warning(f"Unhandled LaTeX macro '{node.macroname}'")
                subnodes = node.nodeargd.argnlist
                _parse_nodelist_to_element(
                    subnodes, element, use_fixed_case, in_macro=True
                )
        elif node.isNodeType(LatexGroupNode):
            # Bracketed group, such as {...} or [...]
            if use_fixed_case and not in_macro and _should_wrap_in_fixed_case(node):
                # Protect this with <fixed-case>, then recurse.  Protection is
                # switched off for the recursion, so that nested braces don't
                # produce nested <fixed-case> tags.
                subelem = etree.SubElement(element, "fixed-case")
                _parse_nodelist_to_element(node.nodelist, subelem, False)
            elif node.delimiters == ("{", "}"):
                # Just recurse
                _parse_nodelist_to_element(node.nodelist, element, use_fixed_case)
            else:
                # Skip [...] or <...> groups
                pass
        elif node.isNodeType(LatexMathNode):
            # Math node
            if _is_trivial_math(node):
                # Just append as text
                append_text(element, LATEX_TO_TEXT.math_node_to_text(node))
            else:
                # Keep verbatim, but wrap in <tex-math>.  Remove exactly the
                # delimiters of this node, so that \(...\) and \[...\] are
                # handled too and a trailing escaped \$ is left intact.
                content = node.latex_verbatim()
                opening, closing = node.delimiters
                if opening and content.startswith(opening):
                    content = content[len(opening):]
                if closing and content.endswith(closing):
                    content = content[: len(content) - len(closing)]
                subelem = etree.SubElement(element, "tex-math")
                subelem.text = content
        elif node.isNodeType(LatexSpecialsNode):
            # TODO: Is this always the correct way?
            append_text(element, LATEX_TO_TEXT.specials_node_to_text(node))
        else:
            # Comments or environments; report the node's class name
            log.warning(f"Unhandled node type: {type(node).__name__}")

Check warning on line 384 in python/acl_anthology/utils/latex.py

View check run for this annotation

Codecov / codecov/patch

python/acl_anthology/utils/latex.py#L384

Added line #L384 was not covered by tests


def parse_latex_to_xml(latex_input: str, use_fixed_case: bool = True) -> etree._Element:
    """Convert a string with LaTeX markup into the Anthology XML format.

    The input is parsed into LaTeX nodes, which are then converted into the
    children and text of a freshly created ``<root>`` element.

    Arguments:
        latex_input: A string potentially including LaTeX markup.
        use_fixed_case: Flag indicating whether <fixed-case> protection should be applied.

    Returns:
        An XML element representing the given LaTeX input in the Anthology XML format for markup strings.
    """
    root = etree.Element("root")
    # get_latex_nodes() returns the node list as its first item; the
    # remaining items are not needed here
    parsed = LatexWalker(latex_input).get_latex_nodes()
    _parse_nodelist_to_element(parsed[0], root, use_fixed_case)
    return root
24 changes: 24 additions & 0 deletions python/acl_anthology/utils/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,30 @@ def assert_equals(elem: etree._Element, other: etree._Element) -> None:
assert_equals(elem_child, other_child)


def append_text(elem: etree._Element, text: str) -> None:
    """Append text to the end of an XML element's content.

    When the element has child elements, the text goes after the last child,
    i.e. onto that child's tail; otherwise it is added to the element's own
    text attribute.

    Arguments:
        elem: The XML element.
        text: The text string to append to the XML element.

    Returns:
        None; the XML element is modified in-place.
    """
    if len(elem):
        # Children exist, so trailing text lives in the last child's tail
        last_child = elem[-1]
        existing = last_child.tail
        last_child.tail = text if existing is None else existing + text
        return

    # No children; trailing text lives in the element's own text
    existing = elem.text
    elem.text = text if existing is None else existing + text


def clean_whitespace(
text: Optional[str], func: Optional[Callable[[str], str]] = None
) -> Optional[str]:
Expand Down
Loading