Skip to content

Commit

Permalink
Merge pull request #675 from flairNLP/add-el-mundp
Browse files Browse the repository at this point in the history
Add `El Mundo`
  • Loading branch information
MaxDall authored Jan 3, 2025
2 parents 3a6d151 + b7c583b commit 2c12805
Show file tree
Hide file tree
Showing 6 changed files with 399 additions and 0 deletions.
15 changes: 15 additions & 0 deletions docs/supported_publishers.md
Original file line number Diff line number Diff line change
Expand Up @@ -999,6 +999,21 @@
</tr>
</thead>
<tbody>
<tr>
<td>
<code>ElMundo</code>
</td>
<td>
<div>El Mundo</div>
</td>
<td>
<a href="https://www.elmundo.es/">
<span>www.elmundo.es</span>
</a>
</td>
<td>&#160;</td>
<td>&#160;</td>
</tr>
<tr>
<td>
<code>ElPais</code>
Expand Down
10 changes: 10 additions & 0 deletions src/fundus/publishers/es/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from dateutil.rrule import MONTHLY, rrule

from fundus.publishers.base_objects import Publisher, PublisherGroup
from fundus.publishers.es.el_mundo import ElMundoParser
from fundus.publishers.es.el_pais import ElPaisParser
from fundus.scraping.url import RSSFeed, Sitemap

Expand All @@ -20,3 +21,12 @@ class ES(metaclass=PublisherGroup):
)
],
)
# Publisher entry for El Mundo: it is bound to ElMundoParser and registers
# the two RSS feeds listed below as its sources.
ElMundo = Publisher(
name="El Mundo",
domain="https://www.elmundo.es/",
parser=ElMundoParser,
sources=[
# NOTE(review): both feeds are served from the uecdn.es host rather than
# from the elmundo.es domain itself -- presumably the publisher's CDN; confirm.
RSSFeed("https://e00-elmundo.uecdn.es/elmundo/rss/portada.xml"),
RSSFeed("https://e00-elmundo.uecdn.es/elmundo/rss/espana.xml"),
],
)
55 changes: 55 additions & 0 deletions src/fundus/publishers/es/el_mundo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import datetime
from typing import List, Optional

from lxml.etree import XPath

from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
from fundus.parser.utility import (
extract_article_body_with_selector,
generic_author_parsing,
generic_date_parsing,
generic_topic_parsing,
image_extraction,
)


class ElMundoParser(ParserProxy):
    """Parser proxy for El Mundo articles; currently exposes a single version, ``V1``."""

    class V1(BaseParser):
        """Extracts body, date, title, authors, topics and images from an article page."""

        # Selectors for the textual parts of the article; the paragraph selector
        # is reused by ``images`` as well.
        _paragraph_selector = XPath("//div[@class='ue-c-article__body']/p")
        _subheadline_selector = XPath("//div[@class='ue-c-article__body']/h2")
        _summary_selector = XPath("//div[@class='ue-c-article__standfirst']//p")

        @attribute
        def body(self) -> Optional[ArticleBody]:
            """Build the article body from the summary, subheadline and paragraph selectors."""
            document = self.precomputed.doc
            return extract_article_body_with_selector(
                document,
                summary_selector=self._summary_selector,
                subheadline_selector=self._subheadline_selector,
                paragraph_selector=self._paragraph_selector,
            )

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            """Parse the ``datePublished`` value found in the page's linked data."""
            raw_date = self.precomputed.ld.bf_search("datePublished")
            return generic_date_parsing(raw_date)

        @attribute
        def title(self) -> Optional[str]:
            """Return the ``headline`` value found in the page's linked data."""
            headline = self.precomputed.ld.bf_search("headline")
            return headline

        @attribute
        def authors(self) -> List[str]:
            """Parse the ``author`` value found in the page's linked data."""
            raw_authors = self.precomputed.ld.bf_search("author")
            return generic_author_parsing(raw_authors)

        @attribute
        def topics(self) -> List[str]:
            """Parse the ``keywords`` meta entry, keeping only the text after the last ``/``."""
            keywords = self.precomputed.meta.get("keywords")
            parsed_topics = generic_topic_parsing(keywords)
            # A keyword may be a slash-separated path; only its final segment is kept.
            return [entry.rsplit("/", 1)[-1] for entry in parsed_topics]

        @attribute
        def images(self) -> List[Image]:
            """Collect article images together with their captions and image authors."""
            # Caption and author are looked up relative to each matched <img>,
            # inside the <figcaption> of its enclosing <figure>.
            image_xpath = XPath("//figure//img[contains(@class, 'article__image')]")
            caption_xpath = XPath("./ancestor::figure//figcaption//span[contains(@class, 'description')]")
            author_xpath = XPath("./ancestor::figure//figcaption//span[contains(@class, 'source')]/span")
            return image_extraction(
                doc=self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
                image_selector=image_xpath,
                caption_selector=caption_xpath,
                author_selector=author_xpath,
            )
Loading

0 comments on commit 2c12805

Please sign in to comment.