Skip to content

Commit

Permalink
update paragraph selector
Browse files Browse the repository at this point in the history
  • Loading branch information
addie9800 committed Feb 23, 2024
1 parent 3179e72 commit fe4a663
Showing 1 changed file with 14 additions and 2 deletions.
16 changes: 14 additions & 2 deletions src/fundus/publishers/de/mdr.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import List, Optional, Pattern

from lxml.cssselect import CSSSelector
from lxml.etree import XPath

from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute
from fundus.parser.utility import (
Expand All @@ -17,7 +18,15 @@
class MDRParser(ParserProxy):
class V1(BaseParser):
_author_substitution_pattern: Pattern[str] = re.compile(r"MDR \w*$|MDR \w*-\w*$|MDRfragt-Redaktionsteam|^von")
_paragraph_selector = CSSSelector("div.paragraph")
# Regular expression describing source/agency credit snippets. It matches any of:
#   - the literal "MDR AKTUELL (ans)"
#   - "Quelle: " / "Quellen: " followed by 3-4 letters or dots
#   - 1-4 letters wrapped in parentheses, e.g. "(dpa)"
#   - two three-letter tokens joined by a slash, e.g. "dpa/afp"
# The letter classes are spelled [A-Za-z] on purpose: the range [A-z] would
# also accept the ASCII punctuation between "Z" and "a" ([, \, ], ^, _, `).
# The pattern is interpolated into _paragraph_selector inside single quotes,
# so it must never contain a single quote itself.
_source_detection: str = (
    r"((MDR AKTUELL \(ans\))|(Quell(e|en): [A-Za-z\.]{3,4})|(\([A-Za-z]{1,4}\))|([A-Za-z]{3}/[A-Za-z]{3}))"
)
# XPath selecting the <p> elements that make up the article body.
#   - Only <div> elements whose class attribute is exactly "paragraph "
#     (including the trailing space) are considered.
#   - Such a <div> is skipped when it has a direct child <div> carrying the
#     exact media/lightbox class string listed below.
#   - A <p> is dropped when its <em> child or its own text matches
#     _source_detection, evaluated through the EXSLT re:test() function.
# NOTE(review): re:test() is handed a node-set here; under XPath 1.0 string
# conversion presumably only the first <em> / first text node is tested --
# confirm this is the intended behaviour.
_paragraph_selector = XPath(
f"//div[@class='paragraph '"
f" and not(div[@class='mediaCon mediaLeft mediaSizeA cssImage hasNoRessort item-delegated-lightbox'])"
# The pattern is embedded between single quotes, so it must not contain one.
f" ]//p[not(re:test(em, '{_source_detection}') or re:test(text(), '{_source_detection}'))]",
# Binds the "re" prefix used above to the EXSLT regular-expressions namespace.
namespaces={"re": "http://exslt.org/regular-expressions"},
)
_summary_selector = CSSSelector("p.einleitung")
_subheadline_selector = CSSSelector("div > .subtitle")
_author_selector = CSSSelector(".articleMeta > .author")
Expand All @@ -33,7 +42,10 @@ def body(self) -> ArticleBody:

@attribute
def topics(self) -> List[str]:
    """Return the article topics parsed from the page's keyword meta data.

    The ``news_keywords`` entry of the precomputed meta data is preferred;
    the ``keywords`` entry is used only when ``news_keywords`` is missing.
    The selected value is handed to ``generic_topic_parsing``.
    """
    meta = self.precomputed.meta
    raw_keywords = meta.get("news_keywords")
    if raw_keywords is None:
        # Fall back to the generic tag so that pages which only provide
        # "keywords" still yield topics.
        raw_keywords = meta.get("keywords")
    return generic_topic_parsing(raw_keywords)

@attribute
def publishing_date(self) -> Optional[datetime.datetime]:
Expand Down

0 comments on commit fe4a663

Please sign in to comment.