Merge pull request #675 from flairNLP/add-el-mundp

Add `El Mundo`
flairNLP · Jan 3, 2025 · 2c12805 · 2c12805
2 parents 3a6d151 + b7c583b
commit 2c12805
Show file tree

Hide file tree

Showing 6 changed files with 399 additions and 0 deletions.
diff --git a/docs/supported_publishers.md b/docs/supported_publishers.md
@@ -999,6 +999,21 @@
     </tr>
   </thead>
   <tbody>
+    <tr>
+      <td>
+        <code>ElMundo</code>
+      </td>
+      <td>
+        <div>El Mundo</div>
+      </td>
+      <td>
+        <a href="https://www.elmundo.es/">
+          <span>www.elmundo.es</span>
+        </a>
+      </td>
+      <td>&#160;</td>
+      <td>&#160;</td>
+    </tr>
     <tr>
       <td>
         <code>ElPais</code>

diff --git a/src/fundus/publishers/es/__init__.py b/src/fundus/publishers/es/__init__.py
@@ -3,6 +3,7 @@
 from dateutil.rrule import MONTHLY, rrule
 
 from fundus.publishers.base_objects import Publisher, PublisherGroup
+from fundus.publishers.es.el_mundo import ElMundoParser
 from fundus.publishers.es.el_pais import ElPaisParser
 from fundus.scraping.url import RSSFeed, Sitemap
 
@@ -20,3 +21,12 @@ class ES(metaclass=PublisherGroup):
             )
         ],
     )
+    ElMundo = Publisher(  # El Mundo publisher entry, parsed by ElMundoParser
+        name="El Mundo",
+        domain="https://www.elmundo.es/",
+        parser=ElMundoParser,
+        sources=[  # RSS feeds only; no Sitemap source is listed for this publisher
+            RSSFeed("https://e00-elmundo.uecdn.es/elmundo/rss/portada.xml"),  # "portada" feed
+            RSSFeed("https://e00-elmundo.uecdn.es/elmundo/rss/espana.xml"),  # "espana" feed
+        ],
+    )
diff --git a/src/fundus/publishers/es/el_mundo.py b/src/fundus/publishers/es/el_mundo.py
@@ -0,0 +1,55 @@
+import datetime
+from typing import List, Optional
+
+from lxml.etree import XPath
+
+from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
+from fundus.parser.utility import (
+    extract_article_body_with_selector,
+    generic_author_parsing,
+    generic_date_parsing,
+    generic_topic_parsing,
+    image_extraction,
+)
+
+
+class ElMundoParser(ParserProxy):
+    class V1(BaseParser):
+        _paragraph_selector = XPath("//div[@class='ue-c-article__body']/p")
+        _subheadline_selector = XPath("//div[@class='ue-c-article__body']/h2")
+        _summary_selector = XPath("//div[@class='ue-c-article__standfirst']//p")
+
+        @attribute
+        def body(self) -> Optional[ArticleBody]:
+            return extract_article_body_with_selector(
+                self.precomputed.doc,
+                paragraph_selector=self._paragraph_selector,
+                subheadline_selector=self._subheadline_selector,
+                summary_selector=self._summary_selector,
+            )
+
+        @attribute
+        def publishing_date(self) -> Optional[datetime.datetime]:
+            return generic_date_parsing(self.precomputed.ld.bf_search("datePublished"))
+
+        @attribute
+        def title(self) -> Optional[str]:
+            return self.precomputed.ld.bf_search("headline")
+
+        @attribute
+        def authors(self) -> List[str]:
+            return generic_author_parsing(self.precomputed.ld.bf_search("author"))
+
+        @attribute
+        def topics(self) -> List[str]:
+            return [topic.split("/")[-1] for topic in generic_topic_parsing(self.precomputed.meta.get("keywords"))]
+
+        @attribute
+        def images(self) -> List[Image]:
+            return image_extraction(
+                doc=self.precomputed.doc,
+                paragraph_selector=self._paragraph_selector,
+                image_selector=XPath("//figure//img[contains(@class, 'article__image')]"),
+                caption_selector=XPath("./ancestor::figure//figcaption//span[contains(@class, 'description')]"),
+                author_selector=XPath("./ancestor::figure//figcaption//span[contains(@class, 'source')]/span"),
+            )