Skip to content

Commit

Permalink
Update documentation and initialization for Italian newspapers
Browse files Browse the repository at this point in the history
  • Loading branch information
ruggsea committed Feb 4, 2025
1 parent d8809b1 commit 005b328
Show file tree
Hide file tree
Showing 2 changed files with 119 additions and 0 deletions.
30 changes: 30 additions & 0 deletions docs/supported_publishers.md
Original file line number Diff line number Diff line change
Expand Up @@ -1138,6 +1138,36 @@
</tr>
</thead>
<tbody>
<tr>
<td>
<code>CorriereDellaSera</code>
</td>
<td>
<div>Corriere Della Sera</div>
</td>
<td>
<a href="https://www.corriere.it">
<span>www.corriere.it</span>
</a>
</td>
<td>&#160;</td>
<td>&#160;</td>
</tr>
<tr>
<td>
<code>IlGiornale</code>
</td>
<td>
<div>Il Giornale</div>
</td>
<td>
<a href="https://www.ilgiornale.it">
<span>www.ilgiornale.it</span>
</a>
</td>
<td>&#160;</td>
<td>&#160;</td>
</tr>
<tr>
<td>
<code>LaRepubblica</code>
Expand Down
89 changes: 89 additions & 0 deletions src/fundus/publishers/it/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
from dateutil.rrule import MONTHLY, rrule

from fundus.publishers.base_objects import Publisher, PublisherGroup
from fundus.publishers.it.corriere_della_sera import CorriereDellaSeraParser
from fundus.publishers.it.il_giornale import IlGiornaleParser
from fundus.publishers.it.la_repubblica import LaRepubblicaParser
from fundus.scraping.url import RSSFeed, Sitemap

Expand All @@ -22,3 +24,90 @@ class IT(metaclass=PublisherGroup):
)
],
)

# Publisher entry for Corriere Della Sera.
# Binds the display name and base domain to CorriereDellaSeraParser and lists
# every RSS feed and sitemap URL handed to the Publisher as a source.
# No custom request header is passed for this publisher.
CorriereDellaSera = Publisher(
name="Corriere Della Sera",
domain="https://www.corriere.it",
parser=CorriereDellaSeraParser,
sources=[
# RSS feeds: homepage feed, the "ultimora" feed, and four section feeds
# served from the dynamic-feed endpoint.
RSSFeed("https://www.corriere.it/feed-hp/homepage.xml"),
RSSFeed("https://www.corriere.it/rss/ultimora.xml"),
RSSFeed("https://www.corriere.it/dynamic-feed/rss/section/Dataroom.xml"),
RSSFeed("https://www.corriere.it/dynamic-feed/rss/section/lettere-al-direttore.xml"),
RSSFeed("https://www.corriere.it/dynamic-feed/rss/section/lo-dico-al-corriere.xml"),
RSSFeed("https://www.corriere.it/dynamic-feed/rss/section/frammenti-di-ferruccio-de-bortoli.xml"),
# Main sitemaps: the general v2 sitemap and the health dictionary sitemap.
Sitemap("https://www.corriere.it/rss/sitemap_v2.xml"),
Sitemap("https://www.corriere.it/salute/sitemap-dizionario-corriere-salute.xml"),
# Dynamic per-section sitemaps under the `sitemap-last-100` path
# (presumably the 100 most recent articles per section - confirm against the site).
# Section names are case-sensitive path segments and use either `-` or `__`
# as word separators; they are reproduced here exactly as listed.
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/video/Corriere.xml"),
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Economia.xml"),
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Salute.xml"),
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Scienze.xml"),
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Interni.xml"),
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Esteri.xml"),
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Sport.xml"),
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Politica.xml"),
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Salute__Figli__e__Genitori.xml"),
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Salute__Sportello__Cancro.xml"),
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Elezioni.xml"),
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Tecnologia.xml"),
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Offerte__recensioni.xml"),
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Lotterie.xml"),
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Spettacoli.xml"),
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Scuola.xml"),
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Animali.xml"),
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Opinioni.xml"),
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Caffe-gramellini.xml"),
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Ultimo-banco.xml"),
# NOTE(review): "Letti-da-rifarei" may be a typo for "Letti-da-rifare" -
# confirm this URL actually resolves before relying on it.
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Letti-da-rifarei.xml"),
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Piccole-dosi.xml"),
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/L-angolo.xml"),
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Padiglione-italia.xml"),
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Facce-nuove.xml"),
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Ritorno-in-solferino.xml"),
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Oriente-occidente.xml"),
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Sette.xml"),
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Moda.xml"),
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/BuoneNotizie.xml"),
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/lettere__al__direttore.xml"),
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/lo__dico__al__corriere.xml"),
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Frammenti-ferruccio-de-bortoli.xml"),
# Section sitemaps: static per-section sitemaps and sitemap indexes.
Sitemap("https://www.corriere.it/rss/sitemap/Motori.xml"),
Sitemap("https://www.corriere.it/rss/sitemap/Cultura.xml"),
# NOTE(review): this sitemap lives on the vivimilano.corriere.it subdomain rather
# than www.corriere.it - confirm the parser is expected to handle those pages.
Sitemap("https://vivimilano.corriere.it/sitemap_index.xml"),
Sitemap("https://www.corriere.it/cook/sitemap-index.xml"),
Sitemap("https://www.corriere.it/oroscopo/sitemap.xml"),
Sitemap("https://www.corriere.it/elezioni/sitemap/sitemap.xml"),
Sitemap("https://www.corriere.it/sport/risultati-live/sitemap.xml"),
Sitemap("https://www.corriere.it/salute/il-medico-risponde/sitemap.xml"),
Sitemap("https://www.corriere.it/rss/sitemap/lettere-al-direttore.xml"),
Sitemap("https://www.corriere.it/rss/sitemap/lo-dico-al-corriere.xml"),
Sitemap("https://www.corriere.it/rss/sitemap/Cook-Last.xml"),
Sitemap("https://www.corriere.it/economia/chiedi-esperto/sitemap.xml"),
Sitemap("https://www.corriere.it/economia/chiedi-esperto/news/sitemap.xml"),
Sitemap("https://www.corriere.it/studio/sitemap-studio.xml"),
],
)

# Publisher entry for Il Giornale.
# Binds the display name and base domain to IlGiornaleParser, supplies an
# explicit set of HTTP request headers, and lists two RSS feeds and two
# sitemaps as sources.
IlGiornale = Publisher(
name="Il Giornale",
domain="https://www.ilgiornale.it",
parser=IlGiornaleParser,
# Browser-like request headers sent for this publisher.
# NOTE(review): presumably needed because the site rejects requests with
# default client headers - confirm, and document the reason if so.
request_header={
# User-Agent is pinned to a desktop Chrome 121 on Windows string.
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
# Requests English content first; the site itself is Italian.
"Accept-Language": "en-US,en;q=0.9",
# NOTE(review): this advertises Brotli ("br"). Confirm the HTTP client in use
# can decode Brotli responses, otherwise bodies may come back undecoded.
"Accept-Encoding": "gzip, deflate, br",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
},
sources=[
# RSS feeds.
RSSFeed("https://www.ilgiornale.it/feed.xml"),
RSSFeed("https://www.ilgiornale.it/feed/rss.xml"),
# Sitemaps: the Google News sitemap and the "indice" sitemap
# (presumably a sitemap index - verify).
Sitemap("https://www.ilgiornale.it/sitemap/google-news.xml"),
Sitemap("https://www.ilgiornale.it/sitemap/indice.xml"),
],
)

0 comments on commit 005b328

Please sign in to comment.