Add functionality to exclude tags from extraction and normalize space #382

Merged: 6 commits, Mar 7, 2024. Changes shown from 3 commits.
src/fundus/parser/data.py (8 changes: 2 additions & 6 deletions)

@@ -199,12 +199,8 @@ def as_text_sequence(self) -> TextSequence:
         texts = [text for tl in self.df_traversal() for text in tl]
         return TextSequence(texts)
 
-    def text(self, join_on: str = "\n\n", strip_text: bool = True) -> str:
-        if strip_text:
-            striped_texts = [" ".join(text.split()) for text in self.as_text_sequence()]
-            return join_on.join(striped_texts)
-        else:
-            return join_on.join(self.as_text_sequence())
+    def text(self, join_on: str = "\n\n") -> str:
+        return join_on.join(self.as_text_sequence())
 
     def df_traversal(self) -> Iterable[TextSequence]:
         def recursion(o: object):
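For context on this simplification: the whitespace collapsing removed from `text()` now happens at extraction time (see `normalize_whitespace` in utility.py below). A minimal sketch of the equivalence, using hypothetical standalone helpers rather than the real `ArticleBody` API:

```python
# Sketch only: old_text mirrors the removed strip_text branch, new_text the
# simplified method. Texts are now normalized during extraction, so text()
# can be a plain join.

def old_text(texts, join_on="\n\n", strip_text=True):
    if strip_text:
        striped = [" ".join(t.split()) for t in texts]
        return join_on.join(striped)
    return join_on.join(texts)

def new_text(texts, join_on="\n\n"):
    return join_on.join(texts)

raw = ["First  paragraph,\twith  noise", "Second paragraph"]
normalized = [" ".join(t.split()) for t in raw]  # normalize_whitespace, inlined
assert old_text(raw, strip_text=True) == new_text(normalized)
```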
src/fundus/parser/utility.py (30 changes: 23 additions & 7 deletions)

@@ -17,7 +17,6 @@
     cast,
 )
 
-import dateutil.tz
 import lxml.html
 import more_itertools
 from dateutil import parser
@@ -27,15 +26,32 @@
 from fundus.parser.data import ArticleBody, ArticleSection, TextSequence
 
 
+def normalize_whitespace(text: str) -> str:
+    return " ".join(text.split())
+
+
 @total_ordering
 @dataclass(eq=False)
 class Node:
     position: int
     node: lxml.html.HtmlElement = field(compare=False)
     _break_selector: ClassVar[XPath] = XPath("*//br")
 
-    def striped(self, chars: Optional[str] = None) -> str:
-        return str(self).strip(chars)
+    # one could replace this recursion with XPath using an expression like this:
+    # //*[not(self::script) and text()]/text(), but for whatever reason, that's actually 50-150% slower
+    # than simply using the implemented mixture below
+    def text_content(self, excluded_tags: Optional[List[str]] = None) -> str:
+        guarded_excluded_tags: List[str] = excluded_tags or []
+
+        def _text_content(element: lxml.html.HtmlElement) -> str:
+            if element.tag in guarded_excluded_tags:
+                return ""
+            text = element.text or "" if not isinstance(element, lxml.html.HtmlComment) else ""
+            children = "".join([_text_content(child) for child in element.iterchildren()]) or ""
+            tail = element.tail or ""
+            return text + children + tail
Review thread on the line `return text + children + tail`:

Collaborator suggested:
-            return text + children + tail
+            return f"{text}{children}{tail}"

Author: I would prefer the plus concatenation over f-strings; it's just personal preference here.

Collaborator: Ok, it's just to avoid the creation of an intermediate string `text + children` with the f-string.
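For the curious, a quick way to see the intermediate-string point the reviewer is making, using only the standard library (nothing here is part of the PR): CPython evaluates `text + children + tail` left to right, materializing a temporary `text + children`, while an f-string compiles to a single `BUILD_STRING` over all three parts.

```python
import dis

# Chained +: two separate concatenation ops, i.e. a temporary string is built.
dis.dis(compile("text + children + tail", "<concat>", "eval"))

# f-string: the parts are formatted and joined in a single BUILD_STRING step.
dis.dis(compile("f'{text}{children}{tail}'", "<fstring>", "eval"))
```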


+
+        return _text_content(self._get_break_preserved_node())
 
     def _get_break_preserved_node(self) -> lxml.html.HtmlElement:
         copied_node = copy(self.node)

@@ -55,10 +71,10 @@ def __hash__(self) -> int:
         return hash(self.position)
 
     def __str__(self) -> str:
-        return self._get_break_preserved_node().text_content()
+        return self.text_content()
 
     def __bool__(self):
-        return bool(self.striped())
+        return bool(normalize_whitespace(self.text_content()))
 
 
 class SummaryNode(Node):
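To make the new extraction behavior concrete, here is a small standalone sketch of the same recursion (assuming lxml is installed; it re-implements the method's logic outside the Node class purely for illustration). Note that an excluded element's tail text is dropped along with it:

```python
import lxml.html

def text_content(element, excluded_tags=()):
    # Skip excluded subtrees entirely (their tail text is dropped too).
    if element.tag in excluded_tags:
        return ""
    # Comments contribute no text of their own, only their tail.
    text = element.text or "" if not isinstance(element, lxml.html.HtmlComment) else ""
    children = "".join(text_content(child, excluded_tags) for child in element.iterchildren())
    tail = element.tail or ""
    return text + children + tail

root = lxml.html.fragment_fromstring("<p>Read <b>bold</b> text<script>var x = 1;</script></p>")
print(text_content(root, excluded_tags=("script",)))  # -> Read bold text
```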
@@ -106,13 +122,13 @@ def extract_nodes(selector: XPath, node_type: Type[Node]) -> List[Node]:
     first = next(instructions)
     instructions = itertools.chain([first, []], instructions)
 
-    summary = TextSequence(map(lambda x: x.striped("\n"), next(instructions)))
+    summary = TextSequence(map(lambda x: normalize_whitespace(x.text_content(["script"])), next(instructions)))
     sections: List[ArticleSection] = []
 
     for chunk in more_itertools.chunked(instructions, 2):
         if len(chunk) == 1:
             chunk.append([])
-        texts = [list(map(lambda x: x.striped("\n"), c)) for c in chunk]
+        texts = [list(map(lambda x: normalize_whitespace(x.text_content(["script"])), c)) for c in chunk]
         sections.append(ArticleSection(*map(TextSequence, texts)))
 
     return ArticleBody(summary=summary, sections=sections)
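The chunk-and-pad pattern above can be hard to read in diff form; here it is in isolation, with made-up placeholder groups standing in for the real Fundus node lists:

```python
import more_itertools

# Pair the remaining node groups two at a time; a trailing unpaired group
# is padded with an empty list, mirroring the loop in extract_nodes.
groups = iter([["headline A"], ["para 1", "para 2"], ["headline B"]])
sections = []
for chunk in more_itertools.chunked(groups, 2):
    if len(chunk) == 1:
        chunk.append([])
    sections.append(tuple(chunk))

print(sections)  # [(['headline A'], ['para 1', 'para 2']), (['headline B'], [])]
```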