Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add GL #705

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions docs/supported_publishers.md
Original file line number Diff line number Diff line change
Expand Up @@ -1211,6 +1211,38 @@
</table>


## GL-Publishers

<table class="publishers gl">
<thead>
<tr>
<th>Class&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
<th>Name&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
<th>URL&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
<th>Missing&#160;Attributes</th>
<th>Additional&#160;Attributes&#160;&#160;&#160;&#160;</th>
</tr>
</thead>
<tbody>
<tr>
<td>
<code>Sermitsiaq</code>
</td>
<td>
<div>Sermitsiaq</div>
</td>
<td>
<a href="https://www.sermitsiaq.ag/">
<span>www.sermitsiaq.ag</span>
</a>
</td>
<td>&#160;</td>
<td>&#160;</td>
</tr>
</tbody>
</table>


## IND-Publishers

<table class="publishers ind">
Expand Down
1 change: 1 addition & 0 deletions src/fundus/parser/utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -492,6 +492,7 @@ def preprocess_url(url: str, domain: str) -> str:

def image_author_parsing(authors: Union[str, List[str]]) -> List[str]:
credit_keywords = [
"fotograf",
"credits?",
"quellen?",
"bild(rechte)?",
Expand Down
2 changes: 2 additions & 0 deletions src/fundus/publishers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from fundus.publishers.dk import DK
from fundus.publishers.es import ES
from fundus.publishers.fr import FR
from fundus.publishers.gl import GL
from fundus.publishers.ind import IND
from fundus.publishers.it import IT
from fundus.publishers.jp import JP
Expand Down Expand Up @@ -64,6 +65,7 @@ class PublisherCollection(metaclass=PublisherCollectionMeta):
us = US
uk = UK
fr = FR
gl = GL
ch = CH
lt = LT
cn = CN
Expand Down
13 changes: 13 additions & 0 deletions src/fundus/publishers/gl/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from fundus.publishers.base_objects import Publisher, PublisherGroup
from fundus.scraping.url import Sitemap

from .sermitsiaq import SermitsiaqParser


# Publisher group "GL" (presumably the ISO 3166 country code for Greenland —
# confirm). Each class attribute declares one publisher of the group.
class GL(metaclass=PublisherGroup):
    # Sermitsiaq: articles are parsed with SermitsiaqParser and discovered
    # through the site's sitemap.
    Sermitsiaq = Publisher(
        name="Sermitsiaq",
        domain="https://www.sermitsiaq.ag/",
        parser=SermitsiaqParser,
        # Single URL source: the sitemap served from the publisher's domain.
        sources=[Sitemap("https://www.sermitsiaq.ag/sitemap.xml")],
    )
66 changes: 66 additions & 0 deletions src/fundus/publishers/gl/sermitsiaq.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import datetime
import re
from typing import List, Optional

from lxml.cssselect import CSSSelector
from lxml.etree import XPath

from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
from fundus.parser.utility import (
extract_article_body_with_selector,
generic_author_parsing,
generic_date_parsing,
generic_topic_parsing,
image_extraction,
)


class SermitsiaqParser(ParserProxy):
    """Parser proxy for articles published on https://www.sermitsiaq.ag/."""

    class V1(BaseParser):
        """First parser version; extracts body, metadata and images from an article page."""

        # Paragraphs inside the 'bodytext' container. Excluded are paragraphs whose class is
        # exactly 'offer-description' and paragraphs whose text starts and ends with a slash
        # (regex '^/.*/$', evaluated through the EXSLT regular-expressions extension).
        _paragraph_selector = XPath(
            "//div[contains(@class, 'bodytext')]"
            "//p[not(@class='offer-description' or re:test(text(), '^/.*/$'))]",
            namespaces={"re": "http://exslt.org/regular-expressions"},
        )
        # NOTE(review): the class value is matched exactly, including the trailing space —
        # presumably this mirrors the site's markup; confirm before normalizing.
        _summary_selector = XPath("//h2[@class='subtitle ']")
        _subheadline_selector = XPath("//div[contains(@class, 'bodytext')]//h3[not(@class='offer-name')]")

        # Image selectors are compiled once at class creation instead of on every `images` call.
        # Images inside a <figure>, skipping those marked with itemprop='image'.
        _image_selector = XPath("//figure//img[not(@itemprop='image')]")
        # Caption / author are looked up relative to the image, below an ancestor that is
        # either a <figure> or a <div> whose class contains 'articleHeader'.
        _caption_selector = XPath(
            "./ancestor::*[self::figure or (self::div and contains(@class,'articleHeader'))]"
            "//figcaption[@itemprop='caption']"
        )
        _author_selector = XPath(
            "./ancestor::*[self::figure or (self::div and contains(@class,'articleHeader'))]"
            "//figcaption[@itemprop='author']"
        )

        @attribute
        def body(self) -> Optional[ArticleBody]:
            """Return the article body built from the summary, subheadline and paragraph selectors."""
            return extract_article_body_with_selector(
                self.precomputed.doc,
                summary_selector=self._summary_selector,
                subheadline_selector=self._subheadline_selector,
                paragraph_selector=self._paragraph_selector,
            )

        @attribute
        def authors(self) -> List[str]:
            """Return the authors found under the 'author' key of the page's linked data."""
            # Split on " og " with surrounding spaces so the delimiter cannot match inside a
            # name containing the letters "og" (e.g. "Skogen").
            # NOTE(review): "og" is presumably the Danish "and", and this assumes
            # generic_author_parsing treats split_on entries as substring/regex
            # delimiters — confirm.
            return generic_author_parsing(self.precomputed.ld.bf_search("author"), split_on=[" og "])

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            """Return the publishing date parsed from the 'datePublished' linked-data entry."""
            return generic_date_parsing(self.precomputed.ld.bf_search("datePublished"))

        @attribute
        def title(self) -> Optional[str]:
            """Return the 'headline' entry of the page's linked data."""
            return self.precomputed.ld.bf_search("headline")

        @attribute
        def topics(self) -> List[str]:
            """Return the topics from the 'article:tag' meta entry, each converted with str.title()."""
            return [tag.title() for tag in generic_topic_parsing(self.precomputed.meta.get("article:tag"))]

        @attribute
        def images(self) -> List[Image]:
            """Return the article images together with their captions and image authors."""
            return image_extraction(
                doc=self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
                image_selector=self._image_selector,
                caption_selector=self._caption_selector,
                author_selector=self._author_selector,
            )
156 changes: 156 additions & 0 deletions tests/resources/parser/test_data/gl/Sermitsiaq.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
{
"V1": {
"authors": [
"Trine Juncher Jørgensen",
"Tôrtia Reimer-Johansen"
],
"body": {
"summary": [],
"sections": [
{
"headline": [],
"paragraphs": [
"Et stort flertal af den grønlandske befolkning - 85 procent - ønsker ikke, at Grønland træder ud af Rigsfællesskabet og i stedet blive en del af USA.",
"Alligevel svarer knap halvdelen, at de ser Donald Trumps interesse for Grønland som en mulighed, mens den anden halvdel svarer, at de ser det som en trussel.",
"Skal man vælge mellem dansk og amerikansk statsborgerskab, så foretrækker langt størstedelen dansk statsborgerskab (55 procent). En stor del svarer dog ”ved ikke” (37 procent). Kun 8 procent foretrækker amerikansk statsborgerskab."
]
},
{
"headline": [
"Selvstændighed"
],
"paragraphs": [
"I forhold til selvstændighed svarer flertallet, at de ønsker en form for selvstændighed, men inden for en tidshorisont på 10-20 år, og såfremt selvstændighed ikke medfører en nedgang i levevilkår.",
"Det er nogle af resultaterne i den meningsmåling, som analyseinstituttet Verian har udarbejdet for Sermitsiaq og Berlingske. Meningsmålingen er baseret på webinterviews i perioden fra 22-27. januar 2025 med 497 repræsentativt udvalgte borgere i Grønland over 18 år."
]
},
{
"headline": [
"Et grønlandsk anliggende"
],
"paragraphs": [
"Foreholdt meningsmålingen understreger formand for Naalakkersuisut, Múte B. Egede, ”at Grønlands fremtid til enhver tid er et grønlandsk anliggende”.",
"– Og på den baggrund peger denne undersøgelse på, at et stort flertal af grønlænderne ønsker, at Grønland ikke skal underordne sig USA, men at mange er bekymrede over følgende udviklinger, men at mange også ser muligheder i disse udviklinger. Og jeg peger også på, at et flertal af befolkningen ønsker, at der ikke skal være status quo. Og at der skal være tryghed og en beslutning om, at her i landet skal være beslutningskompetence, der vedrører vores fremtid. Selvfølgelig, vi kan se, at denne undersøgelse peger på, at det er op til samfundet at styre, men jeg vil gerne understrege, at det er op til borgerne her at træffe beslutninger for deres fremtid, lyder det."
]
},
{
"headline": [
"Fortsat samarbejde"
],
"paragraphs": [
"I sidste uge kommenterede statsminister Mette Frederiksen også meningsmålingen over for Berlingske. Her lød det, at hun som udgangspunkt ikke ville gå ind i, hvad meningsmålinger fra Grønland måtte vise.",
"– Det til en side er jeg glad for, hvis målingen er udtryk for, at mange grønlændere vil kunne se et fortsat tæt samarbejde med Danmark. Sikkert i en anden udgave end det, vi kender i dag, fordi alting forandrer sig over tid.",
"Efterfølgende har Mette Frederiksen over for Jyllands-Posten i artiklen ”Kamp om Grønland” blandt andet erkendt, at udenrigs- og sikkerhedspolitikken er områder, som udfordres af de rammer som ligger i rigsfællesskabet nu.",
"– Jeg anerkender fuldt ud de ønsker, der er, og vil også gerne løse problemerne, lyder det fra Mette Frederiksen."
]
}
]
},
"images": [
{
"versions": [
{
"url": "https://image.sermitsiaq.ag/2192435.webp?imageId=2192435&x=0.00&y=0.00&cropw=100.00&croph=85.63&width=960&height=548&format=jpg",
"query_width": "max-width:767",
"size": {
"width": 960,
"height": 548
},
"type": "image/jpeg"
},
{
"url": "https://image.sermitsiaq.ag/2192435.webp?imageId=2192435&x=0.00&y=0.00&cropw=100.00&croph=85.63&width=960&height=548&format=webp",
"query_width": "max-width:767",
"size": {
"width": 960,
"height": 548
},
"type": "image/webp"
},
{
"url": "https://image.sermitsiaq.ag/2192435.webp?imageId=2192435&x=0.00&y=0.00&cropw=100.00&croph=85.63&width=2116&height=1208&format=jpg",
"query_width": "min-width:768",
"size": {
"width": 2116,
"height": 1208
},
"type": "image/jpeg"
},
{
"url": "https://image.sermitsiaq.ag/2192435.webp?imageId=2192435&x=0.00&y=0.00&cropw=100.00&croph=85.63&width=2116&height=1208&format=webp",
"query_width": "min-width:768",
"size": {
"width": 2116,
"height": 1208
},
"type": "image/webp"
}
],
"is_cover": true,
"description": null,
"caption": "Formand for Naalakkersuisut, Múte B. Egede har noteret sig meningsmålingens resultater om, at et stort flertal af grønlænderne ikke ønsker, at Grønland skal underordne sig USA. Han noterer sig også, at status quo ikke er en mulighed.",
"authors": [
"Mads Claus Rasmussen"
],
"position": 272
},
{
"versions": [
{
"url": "https://image.sermitsiaq.ag/2192439.webp?imageId=2192439&width=960&height=548&format=jpg",
"query_width": "max-width:767",
"size": {
"width": 960,
"height": 548
},
"type": "image/jpeg"
},
{
"url": "https://image.sermitsiaq.ag/2192439.webp?imageId=2192439&width=960&height=548&format=webp",
"query_width": "max-width:767",
"size": {
"width": 960,
"height": 548
},
"type": "image/webp"
},
{
"url": "https://image.sermitsiaq.ag/2192439.webp?imageId=2192439&width=1412&height=806&format=jpg",
"query_width": "min-width:768",
"size": {
"width": 1412,
"height": 806
},
"type": "image/jpeg"
},
{
"url": "https://image.sermitsiaq.ag/2192439.webp?imageId=2192439&width=1412&height=806&format=webp",
"query_width": "min-width:768",
"size": {
"width": 1412,
"height": 806
},
"type": "image/webp"
}
],
"is_cover": false,
"description": null,
"caption": "Statsminister Mette Frederiksen er ifølge Berlingske ”glad for, hvis målingen er et udtryk for, at mange grønlændere vil kunne se et fortsat tæt samarbejde med Danmark. Sikkert i en anden udgave end det, vi kender i dag, fordi alting forandrer sig over tid.”",
"authors": [
"Mads Claus Rasmussen/Ritzau Scanpix"
],
"position": 310
}
],
"publishing_date": "2025-02-06 19:25:34+00:00",
"title": "Múte B. Egede om meningsmåling: Status quo er ikke en mulighed",
"topics": [
"Múte B. Egede",
"Grønland",
"Selvstændighed",
"Samfund",
"Rigsfællesskabet",
"Donald Trump"
]
}
}
Binary file not shown.
6 changes: 6 additions & 0 deletions tests/resources/parser/test_data/gl/meta.info
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"Sermitsiaq_2025_02_12.html.gz": {
"url": "https://www.sermitsiaq.ag/samfund/mute-b-egede-om-meningsmaling-status-quo-er-ikke-en-mulighed/2192432",
"crawl_date": "2025-02-12 00:45:36.535174"
}
}
Loading