From fbcde65b383900a9728065861b93b8d3ada6ba1c Mon Sep 17 00:00:00 2001
From: Max Dallabetta
Date: Tue, 27 Feb 2024 14:52:31 +0100
Subject: [PATCH 1/6] add functionality to exclude tags from extraction and
 normalize space

---
 src/fundus/parser/utility.py | 30 +++++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/src/fundus/parser/utility.py b/src/fundus/parser/utility.py
index 7a9c1a335..e9b4b302a 100644
--- a/src/fundus/parser/utility.py
+++ b/src/fundus/parser/utility.py
@@ -17,7 +17,6 @@
     cast,
 )
 
-import dateutil.tz
 import lxml.html
 import more_itertools
 from dateutil import parser
@@ -27,6 +26,10 @@
 from fundus.parser.data import ArticleBody, ArticleSection, TextSequence
 
 
+def normalize_whitespace(text: str) -> str:
+    return " ".join(text.split())
+
+
 @total_ordering
 @dataclass(eq=False)
 class Node:
@@ -34,8 +37,21 @@ class Node:
     node: lxml.html.HtmlElement = field(compare=False)
     _break_selector: ClassVar[XPath] = XPath("*//br")
 
-    def striped(self, chars: Optional[str] = None) -> str:
-        return str(self).strip(chars)
+    # one could replace this recursion with XPath using an expression like this:
+    # //*[not(self::script) and text()]/text(), but for whatever reason, that's actually 50-150% slower
+    # than simply using the implemented mixture below
+    def text_content(self, excluded_tags: Optional[List[str]] = None) -> str:
+        excluded_tags = excluded_tags or []
+
+        def _text_content(element: lxml.html.HtmlElement) -> str:
+            if element.tag in excluded_tags:
+                return ""
+            text = element.text or "" if not isinstance(element, lxml.html.HtmlComment) else ""
+            children = "".join([_text_content(child) for child in element.iterchildren()]) or ""
+            tail = element.tail or ""
+            return text + children + tail
+
+        return _text_content(self._get_break_preserved_node())
 
     def _get_break_preserved_node(self) -> lxml.html.HtmlElement:
         copied_node = copy(self.node)
@@ -55,10 +71,10 @@ def __hash__(self) -> int:
         return hash(self.position)
 
     def __str__(self) -> str:
-        return self._get_break_preserved_node().text_content()
+        return self.text_content()
 
     def __bool__(self):
-        return bool(self.striped())
+        return bool(normalize_whitespace(self.text_content()))
 
 
 class SummaryNode(Node):
@@ -106,13 +122,13 @@ def extract_nodes(selector: XPath, node_type: Type[Node]) -> List[Node]:
         first = next(instructions)
         instructions = itertools.chain([first, []], instructions)
 
-    summary = TextSequence(map(lambda x: x.striped("\n"), next(instructions)))
+    summary = TextSequence(map(lambda x: normalize_whitespace(x.text_content(["script"])), next(instructions)))
 
     sections: List[ArticleSection] = []
     for chunk in more_itertools.chunked(instructions, 2):
         if len(chunk) == 1:
             chunk.append([])
-        texts = [list(map(lambda x: x.striped("\n"), c)) for c in chunk]
+        texts = [list(map(lambda x: normalize_whitespace(x.text_content(["script"])), c)) for c in chunk]
         sections.append(ArticleSection(*map(TextSequence, texts)))
 
     return ArticleBody(summary=summary, sections=sections)
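As a quick illustration of what PATCH 1/6 adds: the recursion walks the element tree, skips excluded tags (subtree and tail alike), and the new normalize_whitespace collapses whatever whitespace remains. Below is a minimal, self-contained sketch of the same idea, with the helpers restated outside the Node class and a made-up HTML snippet; it is an illustration, not the exact library entry point.

    import lxml.html
    from typing import List, Optional


    def normalize_whitespace(text: str) -> str:
        return " ".join(text.split())


    def text_content(element: lxml.html.HtmlElement, excluded_tags: Optional[List[str]] = None) -> str:
        guarded: List[str] = excluded_tags or []
        if element.tag in guarded:
            return ""  # drops the excluded subtree *and* the element's tail text
        # HTML comments carry no meaningful .text
        text = element.text or "" if not isinstance(element, lxml.html.HtmlComment) else ""
        children = "".join(text_content(child, guarded) for child in element.iterchildren())
        tail = element.tail or ""
        return text + children + tail


    fragment = lxml.html.fromstring("<p>Hello <script>var x;</script> <b>world</b>!</p>")
    print(normalize_whitespace(text_content(fragment, excluded_tags=["script"])))  # Hello world!

Note that excluding a tag also discards its tail (the space between </script> and <b> above), which is harmless here because normalize_whitespace collapses whitespace runs anyway.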
From 95ca9acbf1255df7a96854e8f22fcb667434acfa Mon Sep 17 00:00:00 2001
From: Max Dallabetta
Date: Tue, 27 Feb 2024 15:21:13 +0100
Subject: [PATCH 2/6] remove leftover strip

---
 src/fundus/parser/data.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/src/fundus/parser/data.py b/src/fundus/parser/data.py
index 1104176e4..f50c1a111 100644
--- a/src/fundus/parser/data.py
+++ b/src/fundus/parser/data.py
@@ -199,12 +199,8 @@ def as_text_sequence(self) -> TextSequence:
         texts = [text for tl in self.df_traversal() for text in tl]
         return TextSequence(texts)
 
-    def text(self, join_on: str = "\n\n", strip_text: bool = True) -> str:
-        if strip_text:
-            striped_texts = [" ".join(text.split()) for text in self.as_text_sequence()]
-            return join_on.join(striped_texts)
-        else:
-            return join_on.join(self.as_text_sequence())
+    def text(self, join_on: str = "\n\n") -> str:
+        return join_on.join(self.as_text_sequence())
 
     def df_traversal(self) -> Iterable[TextSequence]:
         def recursion(o: object):

From 46167a504cae9365ad94bca60ca39118ac417157 Mon Sep 17 00:00:00 2001
From: Max Dallabetta
Date: Tue, 27 Feb 2024 15:21:22 +0100
Subject: [PATCH 3/6] fix mypy

---
 src/fundus/parser/utility.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/fundus/parser/utility.py b/src/fundus/parser/utility.py
index e9b4b302a..d2f5cbfc4 100644
--- a/src/fundus/parser/utility.py
+++ b/src/fundus/parser/utility.py
@@ -41,10 +41,10 @@ class Node:
     # //*[not(self::script) and text()]/text(), but for whatever reason, that's actually 50-150% slower
     # than simply using the implemented mixture below
     def text_content(self, excluded_tags: Optional[List[str]] = None) -> str:
-        excluded_tags = excluded_tags or []
+        guarded_excluded_tags: List[str] = excluded_tags or []
 
         def _text_content(element: lxml.html.HtmlElement) -> str:
-            if element.tag in excluded_tags:
+            if element.tag in guarded_excluded_tags:
                 return ""
             text = element.text or "" if not isinstance(element, lxml.html.HtmlComment) else ""
             children = "".join([_text_content(child) for child in element.iterchildren()]) or ""

From 5081a740fa54e02a4f6a73c136954a771830089e Mon Sep 17 00:00:00 2001
From: Max Dallabetta
Date: Fri, 1 Mar 2024 12:36:22 +0100
Subject: [PATCH 4/6] Update src/fundus/parser/utility.py

Co-authored-by: Conrad Dobberstein <29147025+dobbersc@users.noreply.github.com>
---
 src/fundus/parser/utility.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fundus/parser/utility.py b/src/fundus/parser/utility.py
index d2f5cbfc4..3d044521a 100644
--- a/src/fundus/parser/utility.py
+++ b/src/fundus/parser/utility.py
@@ -47,7 +47,7 @@ def _text_content(element: lxml.html.HtmlElement) -> str:
             if element.tag in guarded_excluded_tags:
                 return ""
             text = element.text or "" if not isinstance(element, lxml.html.HtmlComment) else ""
-            children = "".join([_text_content(child) for child in element.iterchildren()]) or ""
+            children = "".join([_text_content(child) for child in element.iterchildren()])
             tail = element.tail or ""
             return text + children + tail
 
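Two of the patches above deserve a brief aside. The mypy fix in PATCH 3/6 follows a common pattern: reassigning an Optional parameter and then reading it from a nested function can leave mypy reasoning about the captured name as Optional[List[str]], so the patch binds the guarded value to a fresh, explicitly annotated name instead. A hypothetical minimal sketch of the pattern (function names invented for illustration):

    from typing import List, Optional


    def contains_excluded(tags: List[str], excluded_tags: Optional[List[str]] = None) -> bool:
        guarded_excluded_tags: List[str] = excluded_tags or []  # fresh name, unambiguously List[str]

        def is_excluded(tag: str) -> bool:
            # the closure captures a name typed List[str]; reassigning
            # excluded_tags itself can leave Optional[List[str]] in play for mypy
            return tag in guarded_excluded_tags

        return any(is_excluded(tag) for tag in tags)

The cleanup in PATCH 4/6 is sound because str.join always returns a str: when the join produces "" (falsy), the trailing or "" evaluates to "" anyway, so the suffix was a no-op.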
From 15d3311a87a576100749e62a0f6d0337719929e5 Mon Sep 17 00:00:00 2001
From: Max Dallabetta
Date: Fri, 1 Mar 2024 12:38:02 +0100
Subject: [PATCH 5/6] Apply suggestions from code review

Co-authored-by: Conrad Dobberstein <29147025+dobbersc@users.noreply.github.com>
---
 src/fundus/parser/utility.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/fundus/parser/utility.py b/src/fundus/parser/utility.py
index 3d044521a..0d387f923 100644
--- a/src/fundus/parser/utility.py
+++ b/src/fundus/parser/utility.py
@@ -122,13 +122,13 @@ def extract_nodes(selector: XPath, node_type: Type[Node]) -> List[Node]:
         first = next(instructions)
         instructions = itertools.chain([first, []], instructions)
 
-    summary = TextSequence(map(lambda x: normalize_whitespace(x.text_content(["script"])), next(instructions)))
+    summary = TextSequence(map(lambda x: normalize_whitespace(x.text_content(excluded_tags=["script"])), next(instructions)))
 
     sections: List[ArticleSection] = []
     for chunk in more_itertools.chunked(instructions, 2):
         if len(chunk) == 1:
             chunk.append([])
-        texts = [list(map(lambda x: normalize_whitespace(x.text_content(["script"])), c)) for c in chunk]
+        texts = [list(map(lambda x: normalize_whitespace(x.text_content(excluded_tags=["script"])), c)) for c in chunk]
         sections.append(ArticleSection(*map(TextSequence, texts)))
 
     return ArticleBody(summary=summary, sections=sections)

From 108f05a35199955983837932949afc16456925e1 Mon Sep 17 00:00:00 2001
From: Max Dallabetta
Date: Fri, 1 Mar 2024 12:53:57 +0100
Subject: [PATCH 6/6] black

---
 src/fundus/parser/utility.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/fundus/parser/utility.py b/src/fundus/parser/utility.py
index 0d387f923..390d9f8cc 100644
--- a/src/fundus/parser/utility.py
+++ b/src/fundus/parser/utility.py
@@ -122,7 +122,9 @@ def extract_nodes(selector: XPath, node_type: Type[Node]) -> List[Node]:
         first = next(instructions)
         instructions = itertools.chain([first, []], instructions)
 
-    summary = TextSequence(map(lambda x: normalize_whitespace(x.text_content(excluded_tags=["script"])), next(instructions)))
+    summary = TextSequence(
+        map(lambda x: normalize_whitespace(x.text_content(excluded_tags=["script"])), next(instructions))
+    )
 
     sections: List[ArticleSection] = []
     for chunk in more_itertools.chunked(instructions, 2):
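To round off: the call sites touched by PATCH 5/6 and PATCH 6/6 sit in a loop that pairs consecutive node groups into (headline, paragraphs) sections. A rough standalone sketch of that pairing, with plain lists of strings standing in for the extracted Node groups (sample data invented for illustration):

    import more_itertools

    groups = iter([["headline 1"], ["para 1a", "para 1b"], ["headline 2"]])
    for chunk in more_itertools.chunked(groups, 2):
        if len(chunk) == 1:  # trailing headline with no paragraph group
            chunk.append([])
        headline, paragraphs = chunk
        print(headline, paragraphs)
    # ['headline 1'] ['para 1a', 'para 1b']
    # ['headline 2'] []

Passing excluded_tags=["script"] by keyword, as PATCH 5/6 does, keeps the call sites self-documenting, and PATCH 6/6 merely reflows the longer summary line to satisfy black's line-length limit.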