flairNLP · addie9800 · Feb 12, 2024 · Jan 29, 2024 · Jan 29, 2024 · Feb 5, 2024
diff --git a/docs/supported_publishers.md b/docs/supported_publishers.md
@@ -8,7 +8,7 @@
     <tr>
       <th>Class&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
       <th>Source&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
-      <th>URL&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
+      <th>URL&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
       <th>Missing&#160;Attributes</th>
       <th>Additional&#160;Attributes&#160;&#160;&#160;&#160;</th>
     </tr>
@@ -42,7 +42,7 @@
     <tr>
       <th>Class&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
       <th>Source&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
-      <th>URL&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
+      <th>URL&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
       <th>Missing&#160;Attributes</th>
       <th>Additional&#160;Attributes&#160;&#160;&#160;&#160;</th>
     </tr>
@@ -78,6 +78,23 @@
       <td>&#160;</td>
       <td>&#160;</td>
     </tr>
+    <tr>
+      <td>
+        <code>BSZ</code>
+      </td>
+      <td>
+        <div>Braunschweiger Zeitung</div>
+      </td>
+      <td>
+        <a href="https://www.braunschweiger-zeitung.de/">
+          <span>www.braunschweiger-zeitung.de</span>
+        </a>
+      </td>
+      <td>&#160;</td>
+      <td>
+        <code>free_access</code>
+      </td>
+    </tr>
     <tr>
       <td>
         <code>DW</code>
@@ -316,7 +333,7 @@
     <tr>
       <th>Class&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
       <th>Source&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
-      <th>URL&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
+      <th>URL&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
       <th>Missing&#160;Attributes</th>
       <th>Additional&#160;Attributes&#160;&#160;&#160;&#160;</th>
     </tr>
@@ -348,7 +365,7 @@
     <tr>
       <th>Class&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
       <th>Source&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
-      <th>URL&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
+      <th>URL&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
       <th>Missing&#160;Attributes</th>
       <th>Additional&#160;Attributes&#160;&#160;&#160;&#160;</th>
     </tr>
@@ -382,7 +399,7 @@
     <tr>
       <th>Class&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
       <th>Source&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
-      <th>URL&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
+      <th>URL&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
       <th>Missing&#160;Attributes</th>
       <th>Additional&#160;Attributes&#160;&#160;&#160;&#160;</th>
     </tr>
@@ -459,7 +476,7 @@
     <tr>
       <th>Class&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
       <th>Source&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
-      <th>URL&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
+      <th>URL&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
       <th>Missing&#160;Attributes</th>
       <th>Additional&#160;Attributes&#160;&#160;&#160;&#160;</th>
     </tr>

diff --git a/src/fundus/publishers/de/__init__.py b/src/fundus/publishers/de/__init__.py
@@ -6,6 +6,7 @@
 
 from .berliner_zeitung import BerlinerZeitungParser
 from .bild import BildParser
+from .braunschweiger_zeitung import BSZeitungParser
 from .die_welt import DieWeltParser
 from .die_zeit import DieZeitParser
 from .dw import DWParser
@@ -173,7 +174,10 @@ class DE(PublisherEnum):
     Taz = PublisherSpec(
         name="Die Tageszeitung (taz)",
         domain="https://www.taz.de/",
-        sources=[NewsMap("https://taz.de/sitemap-google-news.xml"), Sitemap("https://taz.de/sitemap-index.xml")],
+        sources=[
+            NewsMap("https://taz.de/sitemap-google-news.xml"),
+            Sitemap("https://taz.de/sitemap-index.xml", reverse=True),
+        ],
         parser=TazParser,
     )
 
@@ -190,3 +194,13 @@ class DE(PublisherEnum):
         sources=[NewsMap("https://www.waz.de/sitemaps/news.xml")],
         parser=WAZParser,
     )
+
+    BSZ = PublisherSpec(
+        name="Braunschweiger Zeitung",
+        domain="https://www.braunschweiger-zeitung.de/",
+        sources=[
+            RSSFeed("https://www.braunschweiger-zeitung.de/rss"),
+            NewsMap("https://www.braunschweiger-zeitung.de/sitemaps/news.xml"),  # news map, as for WAZ
+        ],
+        parser=BSZeitungParser,
+    )
diff --git a/src/fundus/publishers/de/braunschweiger_zeitung.py b/src/fundus/publishers/de/braunschweiger_zeitung.py
@@ -0,0 +1,93 @@
+import datetime
+import re
+from typing import List, Optional, Pattern
+
+from lxml.etree import XPath
+
+from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute
+from fundus.parser.utility import (
+    apply_substitution_pattern_over_list,
+    extract_article_body_with_selector,
+    generic_author_parsing,
+    generic_date_parsing,
+    generic_topic_parsing,
+)
+
+
+class BSZeitungParser(ParserProxy):
+    """Parser proxy for Braunschweiger Zeitung articles."""
+
+    class V1(BaseParser):
+        """Looks attributes up in ``precomputed.ld`` and selects the body from ``div.article-body``."""
+
+        # Passed to `apply_substitution_pattern_over_list` in `authors`; presumably strips the
+        # publisher name from the author list -- TODO confirm against the helper.
+        _author_substitution_pattern: Pattern[str] = re.compile(r"FUNKE Mediengruppe")
+        # <p> elements below the article body, minus those matching the exclusion list
+        # (teaser / cross-link phrases, rel='author', em.print).
+        _paragraph_selector = XPath(
+            "//div[@class='article-body']//p[not(contains(strong, 'Meistgeklickte Nachrichten "
+            "aus der Region') or contains(strong, 'Keine wichtigen News mehr verpassen') or "
+            "@rel='author' or em[@class='print'] or contains(a, 'Jetzt Angebot und Vorteile "
+            "checken') or contains(text(), 'Lesen Sie mehr Geschichten aus')  or contains("
+            "strong, 'Mehr wichtige Nachrichten aus') or contains(strong, 'Täglich wissen, "
+            "was in') or contains(strong, 'Auch interessant') or contains(strong, 'Das könnte "
+            "Sie auch interessieren') or contains(strong, 'Lesen Sie auch') or contains("
+            "strong, 'Mehr zu dem Thema') or contains(strong, 'Mehr zum Thema') or contains("
+            "strong, 'Lesen Sie dazu') or contains(strong, 'Lesen Sie hier'))]"
+        )
+        # NOTE(review): `//p[1]` matches the first <p> of *each* parent below the article body,
+        # not only the first paragraph overall -- confirm this is the intended summary.
+        _summary_selector = XPath("//div[@class='article-body']//p[1]")
+        # <h3> elements below the article body, minus headings containing the listed phrases.
+        _subheadline_selector = XPath(
+            "//div[@class='article-body']//h3[not(contains(text(), 'Alle Artikel der "
+            "Serie') or contains(text(), 'Mehr zum Thema') or contains(text(), "
+            "'weitere Videos') or contains(text(), 'Auch interessant') or contains(text(), "
+            "'Weitere News'))]"
+        )
+
+        @attribute
+        def body(self) -> ArticleBody:
+            """Build the article body from the summary, subheadline and paragraph selectors."""
+            return extract_article_body_with_selector(
+                self.precomputed.doc,
+                summary_selector=self._summary_selector,
+                subheadline_selector=self._subheadline_selector,
+                paragraph_selector=self._paragraph_selector,
+            )
+
+        @attribute
+        def title(self) -> Optional[str]:
+            """Return the ``headline`` entry found via ``precomputed.ld``."""
+            return self.precomputed.ld.bf_search("headline")
+
+        @attribute
+        def topics(self) -> List[str]:
+            """Return the ``keywords`` entry run through `generic_topic_parsing`."""
+            return generic_topic_parsing(self.precomputed.ld.bf_search("keywords"))
+
+        @attribute
+        def authors(self) -> List[str]:
+            """Return the parsed ``author`` entry after applying the substitution pattern."""
+            return apply_substitution_pattern_over_list(
+                generic_author_parsing(self.precomputed.ld.bf_search("author")), self._author_substitution_pattern
+            )
+
+        @attribute
+        def publishing_date(self) -> Optional[datetime.datetime]:
+            """Return the ``datePublished`` entry run through `generic_date_parsing`."""
+            return generic_date_parsing(self.precomputed.ld.bf_search("datePublished"))
+
+        @attribute(validate=False)
+        def free_access(self) -> bool:
+            """Return whether ``isAccessibleForFree`` marks the article as free to read.
+
+            The value may arrive as a JSON boolean or as a string such as ``"True"`` or
+            ``"true"``; both forms are accepted. Anything else, including a missing
+            value, yields ``False``.
+            """
+            value = self.precomputed.ld.bf_search("isAccessibleForFree")
+            if isinstance(value, bool):
+                return value
+            return isinstance(value, str) and value.strip().lower() == "true"
diff --git a/tests/resources/parser/test_data/de/BSZ.json b/tests/resources/parser/test_data/de/BSZ.json
@@ -0,0 +1,12 @@
+{
+  "V1": {
+    "authors": [
+      "Stefan Lienert"
+    ],
+    "publishing_date": "2024-01-29 18:09:51+00:00",
+    "title": "Heide-Park: Eröffnungstermin für neue Attraktion steht fest",
+    "topics": [
+      "Heide-Park Soltau Soltau Freizeitpark"
+    ]
+  }
+}
diff --git a/tests/resources/parser/test_data/de/BSZ_2024_01_29.html.gz b/tests/resources/parser/test_data/de/BSZ_2024_01_29.html.gz
diff --git a/tests/resources/parser/test_data/de/meta.info b/tests/resources/parser/test_data/de/meta.info
@@ -1,4 +1,8 @@
 {
+  "BSZ_2024_01_29.html.gz": {
+    "url": "https://www.braunschweiger-zeitung.de/niedersachsen/article241536118/Heide-Park-Eroeffnungstermin-fuer-neue-Attraktion-steht-fest.html",
+    "crawl_date": "2024-01-29 19:29:19.952428"
+  },
   "BerlinerZeitung_2023_04_28.html.gz": {
     "url": "https://www.berliner-zeitung.de/news/550-kinder-gezeugt-gericht-stoppt-uebereifrigen-samenspender-in-den-niederlanden-li.343191",
     "crawl_date": "2023-04-28 20:25:16.328923"