From fbcde65b383900a9728065861b93b8d3ada6ba1c Mon Sep 17 00:00:00 2001
From: Max Dallabetta
Date: Tue, 27 Feb 2024 14:52:31 +0100
Subject: [PATCH 1/6] add functionality to exclude tags from extraction and
 normalize space

---
 src/fundus/parser/utility.py | 30 +++++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/src/fundus/parser/utility.py b/src/fundus/parser/utility.py
index 7a9c1a335..e9b4b302a 100644
--- a/src/fundus/parser/utility.py
+++ b/src/fundus/parser/utility.py
@@ -17,7 +17,6 @@
     cast,
 )
 
-import dateutil.tz
 import lxml.html
 import more_itertools
 from dateutil import parser
@@ -27,6 +26,10 @@
 from fundus.parser.data import ArticleBody, ArticleSection, TextSequence
 
 
+def normalize_whitespace(text: str) -> str:
+    return " ".join(text.split())
+
+
 @total_ordering
 @dataclass(eq=False)
 class Node:
@@ -34,8 +37,21 @@ class Node:
     node: lxml.html.HtmlElement = field(compare=False)
     _break_selector: ClassVar[XPath] = XPath("*//br")
 
-    def striped(self, chars: Optional[str] = None) -> str:
-        return str(self).strip(chars)
+    # one could replace this recursion with XPath using an expression like this:
+    # //*[not(self::script) and text()]/text(), but for whatever reason, that's actually 50-150% slower
+    # than simply using the implemented mixture below
+    def text_content(self, excluded_tags: Optional[List[str]] = None) -> str:
+        excluded_tags = excluded_tags or []
+
+        def _text_content(element: lxml.html.HtmlElement) -> str:
+            if element.tag in excluded_tags:
+                return ""
+            text = element.text or "" if not isinstance(element, lxml.html.HtmlComment) else ""
+            children = "".join([_text_content(child) for child in element.iterchildren()]) or ""
+            tail = element.tail or ""
+            return text + children + tail
+
+        return _text_content(self._get_break_preserved_node())
 
     def _get_break_preserved_node(self) -> lxml.html.HtmlElement:
         copied_node = copy(self.node)
@@ -55,10 +71,10 @@ def __hash__(self) -> int:
         return hash(self.position)
 
     def __str__(self) -> str:
-        return self._get_break_preserved_node().text_content()
+        return self.text_content()
 
     def __bool__(self):
-        return bool(self.striped())
+        return bool(normalize_whitespace(self.text_content()))
 
 
 class SummaryNode(Node):
@@ -106,13 +122,13 @@ def extract_nodes(selector: XPath, node_type: Type[Node]) -> List[Node]:
         first = next(instructions)
         instructions = itertools.chain([first, []], instructions)
 
-    summary = TextSequence(map(lambda x: x.striped("\n"), next(instructions)))
+    summary = TextSequence(map(lambda x: normalize_whitespace(x.text_content(["script"])), next(instructions)))
 
     sections: List[ArticleSection] = []
     for chunk in more_itertools.chunked(instructions, 2):
         if len(chunk) == 1:
             chunk.append([])
-        texts = [list(map(lambda x: x.striped("\n"), c)) for c in chunk]
+        texts = [list(map(lambda x: normalize_whitespace(x.text_content(["script"])), c)) for c in chunk]
         sections.append(ArticleSection(*map(TextSequence, texts)))
 
     return ArticleBody(summary=summary, sections=sections)
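As a quick illustration of what PATCH 1/6 adds: the recursion walks the element tree, skips excluded tags (subtree and tail alike), and the new normalize_whitespace collapses whatever whitespace remains. Below is a minimal, self-contained sketch of the same idea, with the helpers restated outside the Node class and a made-up HTML snippet; it is an illustration, not the exact library entry point.

    import lxml.html
    from typing import List, Optional


    def normalize_whitespace(text: str) -> str:
        return " ".join(text.split())


    def text_content(element: lxml.html.HtmlElement, excluded_tags: Optional[List[str]] = None) -> str:
        guarded: List[str] = excluded_tags or []
        if element.tag in guarded:
            return ""  # drops the excluded subtree *and* the element's tail text
        # HTML comments carry no meaningful .text
        text = element.text or "" if not isinstance(element, lxml.html.HtmlComment) else ""
        children = "".join(text_content(child, guarded) for child in element.iterchildren())
        tail = element.tail or ""
        return text + children + tail


    fragment = lxml.html.fromstring("<p>Hello <script>var x;</script> <b>world</b>!</p>")
    print(normalize_whitespace(text_content(fragment, excluded_tags=["script"])))  # Hello world!

Note that excluding a tag also discards its tail (the space between </script> and <b> above), which is harmless here because normalize_whitespace collapses whitespace runs anyway.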
From 95ca9acbf1255df7a96854e8f22fcb667434acfa Mon Sep 17 00:00:00 2001
From: Max Dallabetta
Date: Tue, 27 Feb 2024 15:21:13 +0100
Subject: [PATCH 2/6] remove leftover strip

---
 src/fundus/parser/data.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/src/fundus/parser/data.py b/src/fundus/parser/data.py
index 1104176e4..f50c1a111 100644
--- a/src/fundus/parser/data.py
+++ b/src/fundus/parser/data.py
@@ -199,12 +199,8 @@ def as_text_sequence(self) -> TextSequence:
         texts = [text for tl in self.df_traversal() for text in tl]
         return TextSequence(texts)
 
-    def text(self, join_on: str = "\n\n", strip_text: bool = True) -> str:
-        if strip_text:
-            striped_texts = [" ".join(text.split()) for text in self.as_text_sequence()]
-            return join_on.join(striped_texts)
-        else:
-            return join_on.join(self.as_text_sequence())
+    def text(self, join_on: str = "\n\n") -> str:
+        return join_on.join(self.as_text_sequence())
 
     def df_traversal(self) -> Iterable[TextSequence]:
         def recursion(o: object):

From 46167a504cae9365ad94bca60ca39118ac417157 Mon Sep 17 00:00:00 2001
From: Max Dallabetta
Date: Tue, 27 Feb 2024 15:21:22 +0100
Subject: [PATCH 3/6] fix mypy

---
 src/fundus/parser/utility.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/fundus/parser/utility.py b/src/fundus/parser/utility.py
index e9b4b302a..d2f5cbfc4 100644
--- a/src/fundus/parser/utility.py
+++ b/src/fundus/parser/utility.py
@@ -41,10 +41,10 @@ class Node:
     # //*[not(self::script) and text()]/text(), but for whatever reason, that's actually 50-150% slower
     # than simply using the implemented mixture below
     def text_content(self, excluded_tags: Optional[List[str]] = None) -> str:
-        excluded_tags = excluded_tags or []
+        guarded_excluded_tags: List[str] = excluded_tags or []
 
         def _text_content(element: lxml.html.HtmlElement) -> str:
-            if element.tag in excluded_tags:
+            if element.tag in guarded_excluded_tags:
                 return ""
             text = element.text or "" if not isinstance(element, lxml.html.HtmlComment) else ""
             children = "".join([_text_content(child) for child in element.iterchildren()]) or ""

From 5081a740fa54e02a4f6a73c136954a771830089e Mon Sep 17 00:00:00 2001
From: Max Dallabetta
Date: Fri, 1 Mar 2024 12:36:22 +0100
Subject: [PATCH 4/6] Update src/fundus/parser/utility.py

Co-authored-by: Conrad Dobberstein <29147025+dobbersc@users.noreply.github.com>
---
 src/fundus/parser/utility.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fundus/parser/utility.py b/src/fundus/parser/utility.py
index d2f5cbfc4..3d044521a 100644
--- a/src/fundus/parser/utility.py
+++ b/src/fundus/parser/utility.py
@@ -47,7 +47,7 @@ def _text_content(element: lxml.html.HtmlElement) -> str:
             if element.tag in guarded_excluded_tags:
                 return ""
             text = element.text or "" if not isinstance(element, lxml.html.HtmlComment) else ""
-            children = "".join([_text_content(child) for child in element.iterchildren()]) or ""
+            children = "".join([_text_content(child) for child in element.iterchildren()])
             tail = element.tail or ""
             return text + children + tail
 
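Two of the patches above deserve a brief aside. The mypy fix in PATCH 3/6 follows a common pattern: reassigning an Optional parameter and then reading it from a nested function can leave mypy reasoning about the captured name as Optional[List[str]], so the patch binds the guarded value to a fresh, explicitly annotated name instead. A hypothetical minimal sketch of the pattern (function names invented for illustration):

    from typing import List, Optional


    def contains_excluded(tags: List[str], excluded_tags: Optional[List[str]] = None) -> bool:
        guarded_excluded_tags: List[str] = excluded_tags or []  # fresh name, unambiguously List[str]

        def is_excluded(tag: str) -> bool:
            # the closure captures a name typed List[str]; reassigning
            # excluded_tags itself can leave Optional[List[str]] in play for mypy
            return tag in guarded_excluded_tags

        return any(is_excluded(tag) for tag in tags)

The cleanup in PATCH 4/6 is sound because str.join always returns a str: when the join produces "" (falsy), the trailing or "" evaluates to "" anyway, so the suffix was a no-op.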
From 15d3311a87a576100749e62a0f6d0337719929e5 Mon Sep 17 00:00:00 2001
From: Max Dallabetta
Date: Fri, 1 Mar 2024 12:38:02 +0100
Subject: [PATCH 5/6] Apply suggestions from code review

Co-authored-by: Conrad Dobberstein <29147025+dobbersc@users.noreply.github.com>
---
 src/fundus/parser/utility.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/fundus/parser/utility.py b/src/fundus/parser/utility.py
index 3d044521a..0d387f923 100644
--- a/src/fundus/parser/utility.py
+++ b/src/fundus/parser/utility.py
@@ -122,13 +122,13 @@ def extract_nodes(selector: XPath, node_type: Type[Node]) -> List[Node]:
         first = next(instructions)
         instructions = itertools.chain([first, []], instructions)
 
-    summary = TextSequence(map(lambda x: normalize_whitespace(x.text_content(["script"])), next(instructions)))
+    summary = TextSequence(map(lambda x: normalize_whitespace(x.text_content(excluded_tags=["script"])), next(instructions)))
 
     sections: List[ArticleSection] = []
     for chunk in more_itertools.chunked(instructions, 2):
         if len(chunk) == 1:
             chunk.append([])
-        texts = [list(map(lambda x: normalize_whitespace(x.text_content(["script"])), c)) for c in chunk]
+        texts = [list(map(lambda x: normalize_whitespace(x.text_content(excluded_tags=["script"])), c)) for c in chunk]
         sections.append(ArticleSection(*map(TextSequence, texts)))
 
     return ArticleBody(summary=summary, sections=sections)

From 108f05a35199955983837932949afc16456925e1 Mon Sep 17 00:00:00 2001
From: Max Dallabetta
Date: Fri, 1 Mar 2024 12:53:57 +0100
Subject: [PATCH 6/6] black

---
 src/fundus/parser/utility.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/fundus/parser/utility.py b/src/fundus/parser/utility.py
index 0d387f923..390d9f8cc 100644
--- a/src/fundus/parser/utility.py
+++ b/src/fundus/parser/utility.py
@@ -122,7 +122,9 @@ def extract_nodes(selector: XPath, node_type: Type[Node]) -> List[Node]:
         first = next(instructions)
         instructions = itertools.chain([first, []], instructions)
 
-    summary = TextSequence(map(lambda x: normalize_whitespace(x.text_content(excluded_tags=["script"])), next(instructions)))
+    summary = TextSequence(
+        map(lambda x: normalize_whitespace(x.text_content(excluded_tags=["script"])), next(instructions))
+    )
 
     sections: List[ArticleSection] = []
     for chunk in more_itertools.chunked(instructions, 2):
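To round off: the call sites touched by PATCH 5/6 and PATCH 6/6 sit in a loop that pairs consecutive node groups into (headline, paragraphs) sections. A rough standalone sketch of that pairing, with plain lists of strings standing in for the extracted Node groups (sample data invented for illustration):

    import more_itertools

    groups = iter([["headline 1"], ["para 1a", "para 1b"], ["headline 2"]])
    for chunk in more_itertools.chunked(groups, 2):
        if len(chunk) == 1:  # trailing headline with no paragraph group
            chunk.append([])
        headline, paragraphs = chunk
        print(headline, paragraphs)
    # ['headline 1'] ['para 1a', 'para 1b']
    # ['headline 2'] []

Passing excluded_tags=["script"] by keyword, as PATCH 5/6 does, keeps the call sites self-documenting, and PATCH 6/6 merely reflows the longer summary line to satisfy black's line-length limit.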