From 0f685835356ae141693274ee8419617e9b9a4a2c Mon Sep 17 00:00:00 2001 From: Adrian Breiding Date: Mon, 29 Jan 2024 19:33:38 +0100 Subject: [PATCH 1/6] Addition of Braunschweiger Zeitung --- docs/supported_publishers.md | 29 ++++++-- src/fundus/publishers/de/__init__.py | 16 +++- .../publishers/de/braunschweiger_zeitung.py | 69 ++++++++++++++++++ tests/resources/parser/test_data/de/BSZ.json | 12 +++ .../test_data/de/BSZ_2024_01_29.html.gz | Bin 0 -> 30831 bytes tests/resources/parser/test_data/de/meta.info | 4 + 6 files changed, 123 insertions(+), 7 deletions(-) create mode 100644 src/fundus/publishers/de/braunschweiger_zeitung.py create mode 100644 tests/resources/parser/test_data/de/BSZ.json create mode 100644 tests/resources/parser/test_data/de/BSZ_2024_01_29.html.gz diff --git a/docs/supported_publishers.md b/docs/supported_publishers.md index 85b87a4f9..8c59c04d0 100644 --- a/docs/supported_publishers.md +++ b/docs/supported_publishers.md @@ -8,7 +8,7 @@ Class                       Source                                                               - URL                                           + URL                                                     Missing Attributes Additional Attributes     @@ -42,7 +42,7 @@ Class                       Source                                                               - URL                                           + URL                                                     Missing Attributes Additional Attributes     @@ -78,6 +78,23 @@     + + + BSZ + + +
Braunschweiger Zeitung
+ + + + www.braunschweiger-zeitung.de + + +   + + free_access + + DW @@ -316,7 +333,7 @@ Class                       Source                                                               - URL                                           + URL                                                     Missing Attributes Additional Attributes     @@ -348,7 +365,7 @@ Class                       Source                                                               - URL                                           + URL                                                     Missing Attributes Additional Attributes     @@ -382,7 +399,7 @@ Class                       Source                                                               - URL                                           + URL                                                     Missing Attributes Additional Attributes     @@ -459,7 +476,7 @@ Class                       Source                                                               - URL                                           + URL                                                     Missing Attributes Additional Attributes     diff --git a/src/fundus/publishers/de/__init__.py b/src/fundus/publishers/de/__init__.py index 14192bde1..3f6cdd89c 100644 --- a/src/fundus/publishers/de/__init__.py +++ b/src/fundus/publishers/de/__init__.py @@ -6,6 +6,7 @@ from .berliner_zeitung import BerlinerZeitungParser from .bild import BildParser +from .braunschweiger_zeitung import BSZeitungParser from .die_welt import DieWeltParser from .die_zeit import DieZeitParser from .dw import DWParser @@ -173,7 +174,10 @@ class DE(PublisherEnum): Taz = PublisherSpec( name="Die Tageszeitung (taz)", domain="https://www.taz.de/", - sources=[NewsMap("https://taz.de/sitemap-google-news.xml"), Sitemap("https://taz.de/sitemap-index.xml")], + sources=[ + NewsMap("https://taz.de/sitemap-google-news.xml"), + Sitemap("https://taz.de/sitemap-index.xml", reverse=True), + ], parser=TazParser, ) @@ -190,3 +194,13 @@ class DE(PublisherEnum): sources=[NewsMap("https://www.waz.de/sitemaps/news.xml")], parser=WAZParser, ) + + BSZ = PublisherSpec( + name="Braunschweiger Zeitung", + domain="https://www.braunschweiger-zeitung.de/", + sources=[ + RSSFeed("https://www.braunschweiger-zeitung.de/rss"), + Sitemap("https://www.braunschweiger-zeitung.de/sitemaps/news.xml"), + ], + parser=BSZeitungParser, + ) diff --git a/src/fundus/publishers/de/braunschweiger_zeitung.py b/src/fundus/publishers/de/braunschweiger_zeitung.py new file mode 100644 index 000000000..51e3d3929 --- /dev/null +++ b/src/fundus/publishers/de/braunschweiger_zeitung.py @@ -0,0 +1,69 @@ +import datetime +import re +from typing import List, Optional + +from lxml.cssselect import CSSSelector +from lxml.etree import XPath + +from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser.utility import ( + apply_substitution_pattern_over_list, + extract_article_body_with_selector, + generic_author_parsing, + generic_date_parsing, + generic_topic_parsing, +) + + +class BSZeitungParser(ParserProxy): + class V1(BaseParser): + _author_substitution_pattern: re.Pattern[str] = re.compile(r"FUNKE Mediengruppe") + _paragraph_selector = XPath( + "//div[@class='article-body']//p[not(contains(strong, 'Meistgeklickte Nachrichten " + "aus der Region') or contains(strong, 'Keine wichtigen News mehr verpassen') or " + "@rel='author' or em[@class='print'] or contains(a, 'Jetzt Angebot und Vorteile " + "checken') or contains(text(), 'Lesen Sie mehr Geschichten aus') or contains(" + "strong, 'Mehr wichtige Nachrichten aus') or contains(strong, 'Täglich wissen, " + "was in') or contains(strong, 'Auch interessant') or contains(strong, 'Das könnte " + "Sie auch interessieren') or contains(strong, 'Lesen Sie auch') or contains(" + "strong, 'Mehr zu dem Thema') or contains(strong, 'Mehr zum Thema') or contains(" + "strong, 'Lesen Sie dazu') or contains(strong, 'Lesen Sie hier'))]" + ) + _summary_selector = XPath("//div[@class='article-body']//p[1]") + _subheadline_selector = XPath( + "//div[@class='article-body']//h3[not(contains(text(), 'Alle Artikel der " + "Serie') or contains(text(), 'Mehr zum Thema') or contains(text(), " + "'weitere Videos') or contains(text(), 'Auch interessant') or contains(text(), " + "'Weitere News'))]" + ) + + @attribute + def body(self) -> ArticleBody: + return extract_article_body_with_selector( + self.precomputed.doc, + summary_selector=self._summary_selector, + subheadline_selector=self._subheadline_selector, + paragraph_selector=self._paragraph_selector, + ) + + @attribute + def title(self) -> Optional[str]: + return self.precomputed.ld.bf_search("headline") + + @attribute + def topics(self) -> List[str]: + return generic_topic_parsing(self.precomputed.ld.bf_search("keywords")) + + @attribute + def authors(self) -> List[str]: + return apply_substitution_pattern_over_list( + generic_author_parsing(self.precomputed.ld.bf_search("author")), self._author_substitution_pattern + ) + + @attribute + def publishing_date(self) -> Optional[datetime.datetime]: + return generic_date_parsing(self.precomputed.ld.bf_search("datePublished")) + + @attribute(validate=False) + def free_access(self) -> bool: + return self.precomputed.ld.bf_search("isAccessibleForFree") == "True" diff --git a/tests/resources/parser/test_data/de/BSZ.json b/tests/resources/parser/test_data/de/BSZ.json new file mode 100644 index 000000000..677a87185 --- /dev/null +++ b/tests/resources/parser/test_data/de/BSZ.json @@ -0,0 +1,12 @@ +{ + "V1": { + "authors": [ + "Stefan Lienert" + ], + "publishing_date": "2024-01-29 18:09:51+00:00", + "title": "Heide-Park: Eröffnungstermin für neue Attraktion steht fest", + "topics": [ + "Heide-Park Soltau Soltau Freizeitpark" + ] + } +} diff --git a/tests/resources/parser/test_data/de/BSZ_2024_01_29.html.gz b/tests/resources/parser/test_data/de/BSZ_2024_01_29.html.gz new file mode 100644 index 0000000000000000000000000000000000000000..d7db2a20f04d445160ba81d889a8139cd571814e GIT binary patch literal 30831 zcmV)0K+eA(iwFRJ?zd$E|Lnc#a@$CJ--?>5W@GJA3aC-Nq%WsO8jL9_axaBtiPaQU%IdQ_% zyRAh!(FUy@B`I_Qez$ex@#Q><(-s>?Vah|uy7b(1cGu-c-k59jV~csAmwJw`C1c0u zca1F;z?)zZC@-92K#yyXtIgitm zyRB&Y(VMA_$K}a5_U4#VY35-Jr4sfXpK&h;y@V|S_P_u9zkhX7=P(LJJk<81Mf|`2 z{BJhp{`=o1DNoz%ffI)8@cqBTkcaFZPZr}DkD2S0dc&dS!0{92ED{*$$m1~&+w7;% zWnna)G46$&L96fo1v&W)`oo@tlZbflmR@7 zxp&IFbPnjdwHNuRv)C#Snu#|JU9F2Olvr_x`TmUd^<4udav6x2D*80WFJ@u16^;woAYUFr_^{E**;b43W4^$@Q}sa zzuOu+VHA46J6dcO^GVVAa=C1eVrLO1wj3ZtmlG?|iWQ&{k<^e#wvpYoSZ5`jS;T$=v_5lx>!|81 z(peNMJ>5(B#0l9`m_ZmzLy2)TKZ(8ROce?q{q+2=57{%C_i4PC&v`>Z9yp!~6Ax&^ zUseL6X<0=mJt#G}{NTaECaMC&DN2)4TP{D|VjxWT(Q*sOl@pI=RZ(I^S&&^c!hUTY zbCg@QeJ7rBr52KJv?G|ED9@uA<&Kq*W93Lz@%NLtnp> zz{HOn=#Z1ZR0u*M*6|-(jD9Eam;}*}LRF03!SJ95WM#=ma|T~v#y?)ghMCoj-#%{D zd8hH)GJ_3pFM!)ET;(KiZ?j=fpC8|5LmU2VF*OCsVM@=S6hqgKmN}KG&I!zJ$j51w zrmhc;MXw-R$Fb&xM+yixZxXXH~o$0sf)-7oW5NbAdyG zFdD-8h}$7g8xeEEiW5Fw#C*OOId0Mh-V-fAc^v!R`0%1NkKM36_W&AgFS<}Y=0~Y! zv^Z&}UU)Khk_2EFTnK1LA=+a5@s_R$oi`E*sC8&?OS zyrYdRYiKwYA6ctKu-1oF?N|eD>XrlY{70-h2Q{rxfHeo?pD&Ub&Ru6b1D%cg9bC>l z?)+$n|M)IEIe8vDebt>tcDMa5X|1Br>socDAy3X6#*K<8sVsgZUP*QO%e}pW*P@1X zGE2;zv?Pnx%H3dPDWj?}sDkZ$4lCFpF&N({)!r+Ojb$yAh54cu4yxS9gFnETHF%*_ ztt}U=Z|uySvbpMnCz}^X^jIX?$cZ({F%_|L;w=2Mv93uJ2TrQFJOvq`Y(hx6&*!r! z>TW96i3U1$D)e%aZMSMxHSXW1={gTc=bn%{%z31eHiBb za_K)#Ca$gPXO~T#xi$ku$*(9&Dol$d3B1fB>)VG&O`<5Qc$~(1-QF1A3;s(YWoM4~ zDFsVoXdc7oe5$qVo$sX3a4sS^N7z(*F0ubcmD&mXO{2wlrio5veHiG+@cDqj&zDX| zRErgAP)XMOW)1iH$ME?G4fp!zYq$$g@0uT>;jVeUhD`vq`7uCko(XEGJ5CpKt)%mP zP>adjcaoWA>h?f0`<5Is>Lm`UMxcI61;~I}Q3G;9m(lwf$MpvZR6UL`=)`6nH&(d)tF zT7%WY9at+yYYgo#ZEj(lG~!P@*v~n&rfGk`XbtmAn_3v>hfeK92K$+&wrlm)6RCc0 z&^%g|czq#Li~I*i{D}wqnWomXf&cW@5Mh=0x+Iody;&?ypP(RPK_H39|V;y@n^~SMw z73uK{nN}Dkj`!0JwLGmintZ3Ex602vO!y1*bKRc9kjK#Jq`{R!*_>HX`bs+V6y09g z@0gF1ZP~O}@Bux5%DtXYO(bYb#cK=9Imyl4rB< ziImoEsCAJ$$}r{yxJKA7EDG;G-P?jckM6&qKTjUupQqy2ljrnjfB(@v_A8sjQNV7V zV~vcXn7`tqm$)68wBIE+r6HV67U5Vp+w#!ekG_kdsn2&q$3KCZlUo3p82;X7zvK|> zbl3w2;Pu1k9UrG`62(lEVtFy{c2v*GtBWBwK&usX4pJ?iD zHBQ9sJPRPnSy=#c&dT!i?1m84&dLK=xh~6{j==n#>ViP0t#L3XFD-S2(9X43Ae!gi zZQU166>=h;l~+y%HVI*M{gMb-o450Jn08eV!q0E30B!0zJ3gomw- zs(|i84f;lT0c$zO4w{=)T?NkA5Kpu9cB2Xa6TC5$wl~T>;xp&Si;x}-Hz{DA1JY&= z>fJ@MbY?y~6(HXb%%+61xmo7(0owd9G~BG}Yz~Z4_8PfEnueYa8L@J`Bdlx`9V`4z z5lM&o(em4zA;{XY3NkS;<-r`cw`+9rr-aA5#K#|SAGnO0k;*&J?oN{wlJeuU1C@DU zOVkFP(~@|scvF42iw{Y(Ag5t(Pfvt%U)~6CbT^9cwm?cBqA%V_INnhqcc+7WzSGGD zTP61m<9O#BIpS?=r^9;@*c0Hzlfu0>;wVgEmjRWLE4yC zvbtS2c@)Jj9M8q|)0i`LrpZ0u86S$e-n35R2*^cS{Fxt^NIc-jJo3HNJCsSqFZ|(z z5%kK*(z2g#m4<#*TP3&3DqcV*ar&OUc=o{l>+yV1fes|2R1M?>bN$S_-uStd)N-Mg z{K7eT#$#N^DWOEr?+=cbQ~PLg{Ey-6bXqARl&w849 zxV~IL+r&<`R*f@B>Z2S*dpt&kyNpZlJ`wL%3sKd6`>2~!pUE#2G}P+4g!UHcO#hKP zegN4BQ6bOK&6QpDj+WIfyF57rw5+~{ysl7P&}AJxcPmXZG=w-O_okybEhDSHPa5=jC>bYzD$D4pK?a zgtvHn1Q#|))%$qBWxIu4IpjXP^pdC1bjsZ)A-tBE?gN9GM)3)}d9E5(D#^N8FQQw; zJ2~Db58zc{1CjZK*+F*aIn)nI?JVo%UbKkE{Dm@@?UrCc*)Yg6bo+wmZ|)U*(gcqPu_14s&jAc!*O@Ft9GAsC@g_NoL4liZ}bEUynTK^b29!0wmt9 zTAuW6-4d23k$-ho#rh?(KFf}RZm%|HiwS{!mGf+*$0^v-q?D# zXAR6=pVkhtS&7P!MywXzCmd8*!2M#(PkA^^XRjonEDI%d3<}_kp<=WcPfeAFS|@fk|2=VM#G*K+5e+3b<5L z@VB>ebLg$*7IU{)vc>!@=H0Gvt>qj4Eov3`2lRT-%=ZhsIZcB$?Qq-#9e^*< zcFJ$vZcll-?*;H}E4OiC&UeX8_Ur9ZZAVVbCeWCRWghxu;7snFz_5^$-AZo1(ce~? z`8&!_rx&;^0vW5hNHg-;AMOf%Pu%e)`!l=wjeqwh`%g&PR#hOXxJ?RM*+P^h0fwn| zsxwY)bVAT`+|WUD^q*l)u|*F)u3xE^j)c63y(sq56KcSl&dS}Iw1Ei2AfHDrZ+jqo z#_2siiDG{1-4>gu@ci}mEuqT6^hVO#|6qt>brW*tK-bQPJYS?$e38wl}2A+ z?lW8bz$LkAq zpP~)2mm5_P)HJ{Y$CZb>(WZh1BVgjh*n!(FQtyZ(jf1QtVi1hf#ND|ELKUP8blcLq zwyll3&RDbUaaSAJW3GWBrFRFsXX^c)TJS|2;pWv76jLUqHPOwUt2qrNodan zLgYH@g~D-3%|N9Ud)}a32thC94&c-i+Omgw*B>RX3xFi;{JPazKqUwWw5olwZz$kw@qX>ThG#xkbF{T8JB{zmvmSkJzDa@xHXjt(pG+47c z*~0!ad-F>xlsa{kps}5Lf!`030Fej}`9MQ&EK}d=g0Mng`d#_O(CL+)f5Ug(?46l^ zlkcqFmO0ShzOBwOw3qb&utn{XsdUM_#4d?)O99cuokt} z-LH%8X<*7~AFAGh9g#l%B>SW4f*cW+lh z7yAd3`fXiSSS|K_Afqn$^basVCq7}#l8@L`Y1rx6Xk*463|8$4Ej$DoRQ65w1)XZ7Qmyf8-ubRNX%oe1R?f zhvP$11`C+L0R7e&NDk@DOBh;Iw#(x=$U1ko_D1NwMdOb=-)A0H#i=F&>K33Q_636x zxomWjk6xC`YRgpS%>9bpbQWnu`moIT2Sq=JYA0K4QBlsxMJH426ovWT$rJaMce|{| z^IVxVN^gZ8yC}UQzY_1pCMqW{shq#64o4Y$PC%|=nQ#`fClA;yCuEL4pE)BSFk_J7 zVwkK|kh`6M4z$^Puw`vOQ)zD(dTz7*AkTj@gXh`u4q18jm~cF;MOYqx}LZ zNk<3b_d!;9HiI}n*^VoZ>AZh9=D17{Q=$p0C$dZL@g^}k&y+apnZ~k&Ov`8}Q>;=gCR~^Ag3`=H7+m~DHr2Z= z*Gp)-JOnN2+aoXW0CBzaWIKb#osQ_qjiPIvtizibx5~p_XXiSaGK4N}^B;`;2ow~G zw|g%-h8{DWnfQMTIMsMTd-Hdt8Ux(GbL9L1GR6y@^W~+cvl)Dx5%AXqOyvl4z(b}5 zzV-&iH+_EmmLd9TI?eo@shDixMEnGL!a7X!MdI7s0nG+_`3!nAGxZ}=f2^P8;G6*Q zCRi5nEV2TD-L{y~Z=3zDX0++kW!-iUesP~O?S$$2D(TQIIaY+WoOny8^MebNvd$p9I2pyp|fGzvIe_W zdoUO>@x*}{tlu^Ou7kGGh1WyVkWZ1=Z4VKW<^ZQdH%$2yiBPH0Hm!kEPlE4V@u<}a zlVZU1zzp}yhVEoNJq@%$8#*>@+h+GjvmC3f4;Vc}#`z`2+QX8KZ8FhjLr)p?^vF65OaQco zkC|b%%_FTn=sEzLK7J^_uHl4eZ2-M(;wO7+!AlDp>9uXWpZyVuX1h-d31pIL3tgNn~^$_QJwCsQy60G3~>$6vWYT~;{rRRp! z5~Xwul6-||<^h7|LV&y`pI483ZnClyjuCyY2r|mfd>lLsKwu)GJVb)oLypixg69D7 zwHZA!92r=^Q2>}AVbMptHxa;q9(|Z5duYq2NV6pi0gSCD;%kuuUU&7rd}fm;5_{Gz zl<8B+zMZcRn}jlW*R$nQ76bkVc-(ht%~6d^GzXpU_T^Lbg?Nui%MOOzGv!kz4h;*L zgpbf+_&9ymCk!%SIl)r1`a>q3f{7qb4m9E$yLQ{Qaq;0(vKWM(-N%BJBrIZ>#JYCt zwNjJ7y`fx}mNGs?Ir!eEH+?KvNo7S}H^TK<6kHYlMP0cl<9bFseb_b(<7KZs>={pi zpaFU7`9NV|I`w|{$ROznFn^i@%UO- zARH9OZ&0M)!1-=e)v~#d!Erw|^jEPx>`ZQvIPVOU=AbWGjA`4CQI0@$%mi|;`zSKv zlIVAPGs$N(laoLL~JTJ#?3 zZX?s8uk`vQmCtpZKTal-0(m-OYQ!9sXUK`;X!)av-FPKd33cc|gxf>1vCcmxz7Td zqC-{Cm^Lg;V26(fK(fpuTw=Z6WAS!{J)qP8)huA2uPD>T|EdI}DDz_CxXd&yvn7B% zgjG)xn~nl#&$3X|vg zl%b5OzpkaH@RUV1YN#t3((AB$sw3+g4u_Q{%%|{MWIwG^y}GW_@3qY#vNd}KvWD3m z9D(pOG{@Lr*8;h^i&}-<1&V5*&}j5bRI+Bdy=I?7q|0QKT17slEdb!3Ag;<93f>6DE#iV4wy=q8bTKX5R;O z$3jKQ?3z#~wDl4jLw-K!_DCi$El98o3latf_QEo3(Tm<`09p_5eb9Ri%X~Tn*>Tv1 zY%{Ivljdtc%|jR%^b9}%;v&crL*!a*-Ncsr@Y0xJNBd6)1CmmQ02R=q5c_&4?^-ZS zP>>FW0g&-&C0Bt=7?j5Y`ZTMDtPLS;6PY|%SOM6|vVj4JQWN3tk@ZqP@p8V>so zs5e6EL3_;*R0x_r69^op+o%q}pVE&#Q65>IP&Bg2V7*XzTs5dA4Gq!(fmEXmHt6?J zb212h27T70NPw(OwfA~ciL>Om&^hv^PRgId;I0T9vo0Nk*ry9i zb3h$Ym;-*S8`%yKS}Dm$sXSPghDJ_$oC!Q}h1JzSKJ3azBPkXvZ$KhrU^+z7>5&<5 zdcc?3eZ$-}+cv|GFtz9?F!wDBc$i^33f( z{7AD3q(dKpyqYK;^?TWCQwKH&DybYnL$ZeH4Yg@l-Aif`*D>0oY@oEmZXehBW9X4I z!XjZD+C$N=e*lnsv|D0!sY4EmTlmSFw5UVC?G;uu}G-6ST{|G zp0!kqI?7%Sh(tZy)4`{lV4QgKCE)%LZ8YX!7g8xphEoG92&^qLl_QhVOs(e-umlq1 zrZ&BYBA)G|g#iUzG?~K`G4YfZYpl>_c3!{EYMDqZMs_-&su)#OYY?sPHe!SAApb;; zi%-#+UVAX49dD&K&VZUD)~~k_zm)20Rbcgc85LMb*z%Dy*geDn z8%fi^#`5^gs_ucXGUy5euzcn_EM%Jo;hJ`J@R@hkw6o3*yM}y9N{V9e(AcdNLqtO9 ztdF}emBv`sSgnUF>!Du^qz@}bx3CaCXH@2oAnW-NR?b-P(QJE3YDnh9xWkStcM`Yn|n-SG&yeF4yO}7$@Kc0ex5i zUF`D#lF$+mk75~smNzhHHyfCeZ8*p_@RP9y;HA(e$dG0aKk_y0M4+d@fIh{$KK8bn znaRx6nt|=1(gz*4h!mM7Hs8ljrFjFVnWkE8e|7Vvywto=ubG-B3!ZJ$QZd(zzTP^r zhm1pP(0J3>5$rH;!)^wJA@-u&r%gHW#2NEZ6dhify@9+#7u_Go{nLoF4{)Ab;ORZE0xDh=w3>GzX{<;4?!78+rlrJ~WR&cI;cwu-)zXKo*BYDWEIR zBfV!G0Tw`Ca7g&ccjNj!jRuihG8%UsYA7J@L%T8@d(!lKy+qJ?&~ zu1%W}HtyE}H=DHwPhqv|wn#c%jZ*cRjaCWaU?NZ;LuBjXG~rWF`QlA~7qQS_;wh$4 zmJ1BxWPnNfkc?jaKFa3w?CB)l_4|))Kn=9MT8Myyp7j(6Hf{|L%o$R+r^7Z0`j!uv zqz}TJy0k_fdO(QsHsW-$#7~SCr2iNd)SzV6-o^D zS=Y0t5CX(oqie{gcneqvVhFGwv`Xjx6KGlo39s8nyElGvS3CnfiELj(_Zr}6P8+X? zu&rG1?V3Y+`0W9yrZ(p2(8%L;D`{5Oh6JbgDCO<)yd}o zn>0~xu&nHv<1itl4Snqb2Q%d}OC@pFw#L*T3C5zbZl8dd2n6UE=o5{a7xn^eLoh2a zV0dFl4DSwTytYL$62SpJxXXw#_<*+m26`_28#I$T$>9clGWj0IVT{oawTJ1jU~Efn zJajF>MOZ$-ikWz3Xy2o4s6H$uqi4#eYztNsYA*vM+Lk3Aat5Gov{}I{^=cH#C~S7_7#H{v*EK`bm2=9x$$ZF=>M+`p^aqX(;&f!w0p z4A-^c;9MI_JabQ512>a;L2pQQlp)zy&`XPQBEPn1rv!u)(BKf5CV|IP1W`~q zdVS&|puCA^rX_c~2o!>-J?P43rg-++JxZw*!zt+6vM^3UUp|ZZg9g!bNFBqI3%xFG zJetOIXbo5c)ya#!t2+%)vmz zZlVFMFv2k$Ff5fGI!mD%+Q+R+BTpP)5>!KzMB>e%%ZYf#IlgrgN{2ZYo zFs!7qEcKj{KN5{AKr)C_*o5>3aJlYFG*vxCYaEELsz_IdnooXz4CU`L4w%5bYi!e)`N&(d`@}~vo|1LYqLjmz!-okt?Twn{fC|_{1F1lJpe|gb0TyP!PeB8aUG}g65Feau2^YzI zOvUpB_$g{bx@Wxv0Sl=BWE$oiJ!^re02#o0AaxCp23YuMpaFgSG}3?`5dkH=L)!Wbx`F1ZY`Y9--5JOLEU|*o^=ODN z!Y39n4EdB3)E}T%-B9oD4uNpvCzIcYt z19a?VnD1t&`)JS(tUmF~I?_km<4~u*cf}(=EkR~~qbF3^K{xx84c;I#zd;Us0~%p| zOa=#MfZR6}yTeFA`>1O%@yzUt000f1hC%Q9@+lS=rVhn(w@*iP#4}qYgv21tI{13P z#4|g*XM$)8Ou-O^y7DQC7OB04RPi4245;tayVW|Adbr&6jtV@UEPNkOnf_p?WQ+vH zPBnu~e16O@GU!Z`w-n$TZ5;XKc$QRM^|1Lf#|xkFa8ZezrsRbT=+dL%VRjJ3vJR>5 zv}iKH5VhQ8=z0vx4Nvv!(7QgaKG~~f_Jz!G9>pznU@-22F0~4^uCBM_5pJa$WqW4c z9T_bQ56Bnyo>_C}+-w%+Dl3Y_DjPXcTN)1~%C=A=sz@?ZHO{u2$Ofw0k3 zTh2V7B+1Ni;k7?qHz4SBe7Fr|Mi|3Aqk)x)mPg$0<4JIKPuw|D9O75eAUJ^u58`ar z-{F-PlM{`Ht{PfEURxpn7HD+BT)rbp={q9~eEQf~%u@j7``_5!c;$OHtScs>fF_$E@DX~`7_bcJ zHO`WlmYMvL34F*v^0o5D=*c|dix>J{$TdOx6YleIstJoIYvLVoguKNIrp0;yp`9IL zxY=zHZuXCA9D$srWp)o0hw-NXeqNbKR;M0A(=i#FPL1DCqXUlbO~VZNSuoN}#(y8I?zo>_kpPjEF(<7ipMi1|nZC82UlS~7RStdda_yU_eR z_5vq9A-a&EOF0D>pb!}vQ*by(yqe@A?I@#7P6q_82%&-dzBf<2gb_)gFp+>*1bm6p zOxA3&`LWg$rQ{eF^Er>l4q#RdG%-u9uzVCvY#z^p)w+WZnK+n6r7~QK{-L9=rnccQ)ryn z8%_3NyB5Z;N%<@*s4PgeHx_+~K_9(Lbu^3AvI^77vyHw{`8YBGw?lUqby`)E3U!`F zA?Jm(Oq5~iixmG#5YJ`6s*E;Axln~FR#7*NWM&M3F+f8DFKv|!7y?(FMKXE+8^ED# zyI8+Yb9GrL$DewEo-tE(n?b&|Bs66C0_nz%KfVRf{gXyl%fCfwlki%wC?T$D6ob*{ z+tDJW@e-YCPJ$uMeGu(u5wQJO0y|r;|5)%CcV!)a0l!oi#15iWHmaOBthRjArO=B> zmqNn6Ds1<16wJpIrRED|$|A(h`Is>d_;<{P*C9UI=rnmh#Puqu*%syHh@d9p4HTQK z3B0?h!P4=$MO6xYbiP$l2f*)nji^imcdXb?81kI6A2|kZPHP-QN{R{toyEB^ZUo3I z#geolp_sa^@UybKNf?e>T#6ZsR!0f-cPc$AZ)qt%s=IRqsQkRAv_W;8Fx1{+w^#~_ z=C+$9RW|VyaUvg#?`1W&c50~=AtQF6(iUyY*A}F9PYD!_vVtndOA%2@2ee7zmS9@m zVQP4t)3SWnnjm!=w!m1K#N`yJOz-McX&niyQ9ayhoBK%m&}v&yyH-PgcN)iV#}UK8 zTk1=y3NGMvB=}Oz)9TEX(g9GhbNA*i@|Ka-cB^!;MyK`b&5SFR-?JUH&fNA`{ns@8 z<(;y%dF#mB@x5x0p~?UgP+egX6`#orUB{o>Rqg^)1^eYWuz7`Hy^xf2?a9|8j6#%-LD^9aAn9x3f2F zN7Qg0oGvgA`+&Dt-Cd1L`P^e1$ZVJ86IiHQJ8OD=ULP(rU)!1a+HUr*TB${@G7r>k z;+JIsU#Cs3Bv1+wQa3X`I@MFPy+XswX0od@@;pRuWx3RA-WO~94DmKpEm~CWFjq*X z?%QC8vR9*rs^SZuTXujYclCt%XoMR>xit+_Okjj6x&_gYY zhokqurFi@C21*-bUU1A@Ros*WfNH7URnYA${L(&LjowpGg>_uykE?r&#php(V){_>S(U(4lRj$Hoo)mdvao#BFi& za$YjPXXABbppuL`7PW@HT9^y-s^ce{%VV)xqz>*%GJw0(##akz5-!<3{_r=YzNySz zhjNhu04`VkNzL<8Wd2a=yg3+rJ=Xc>9(Q@1fF_>sP#avUt6z?~TAI4E)^4a~R+nxi zf$fy4{k>XeUPM0Nz@aw0M))rW;pL>Axe+h2mA2xuS8PXC^1F3v_u@1e&zwoB8P}&c zz9hHyeB#b@k;rqF{3DdNY^SL0_loF$UHIt&P~iIP#utR-V$#k+@I%U_B6s$N?T8wF zw_=GCMzIq(nsI$x<4Y1uR@ZKB)0UiQVFM+B3y& z%2pP;Fe(mv!C;2;&VD^XIy(beo01@eB#cue)9Op8}m+>mnd048&cU$QTy)| z@ppZnYy0HTsTtQEj9(DujRyGs zdZS?juQ}H8zB5hO?qRy%1?8*9Dyl>-%7Yy32|3{^MK{X2v_7oSmln@Rj_PWJGD@oS z`cqS9HQ-pCJeTNkh7a{1W%vEXXhT_uJE=B? z>xG(oqVjxLwV zi?0^axg@(Ao^o$`wlN@Z;=^?AJLzf138#D%r6p)LCIhf*?Oa3rMI0q_ewgxefP9{n zSpjn;T?KmH)Yl7jfp*MUOaLp-MDtmi&Xes9;IP2fwt9dO8iTj#BAm8ebt2AFsEGMI z@}2OIZe%I<@asLjz=34P+)Fl|CBG=q9-SU(lH@nf&brL?Sy4Lk^>X8hqws|J6={ z7X;CnmgEeuB6Bm)m9#pjMdDWrFwU|AEbCBK z6YFr!QomYIPkaw|i_Sr?Wr@|aTFp?0xJ>+1Jj8SIYUYgy#9h?YXIJ6eY80zSdH9nO4D3r0q&G^`;iVvwI9PlV9#d2u>)s z`Wm+8HB`aMGnZ6NEMYt9@DK@Te$Ji{X%PF2b0Wo>b4+*Nx637`lL== zcf`$dhFs1;8QgV}28CI3V zO`I{X(CCm>e&Yh~+3 zaNp={8~yEWPa93lzTI;>xFJ0{%{!^NPJGygbx~Gz<;A^Lzx+D8#JBq1<>Scp!YL^6 z$w@daC;E{i(tPLiq@3jVY596lz=hWWMmW#xk?f^fN=2ta?JVfe$C$!&Nw?XF8B7l3 zw0-_rll{8dleRB9`Kjj&_b!kiIzTvrhP8KfW-;&O)t<*ESvNn&Os3vpDoN;HuIfCB zK3?&~=pF8VI{Mj(ljQTy3v^<_xqG!I=EuD~?Gg0y+*4E9=Bvv^x;%9S~ZEWdW0ntrC@^MlsTg$@)u~-qFS^V z!K%IK8eKeUvV?ECUYukp9m8-E&*e8tdjc#252;;=AEjhINDfboQpzai&LOEYFc;HO z(hMrq99Ki3QXXMZlNF=H9?0&8yY3#eBFz$%5p)~sS$S`D9mqF>c5eQgqK5Lucw0TU zc4x*N5Tz?BU(9`22uD0GU$?G2rg9b2iZn~54tt(!yNaAoea6C~QOpSWKo`i@bKdZ& z*3((wZqHHN$G^v*0pQ<})-8Hf9gG5#3}blcV!5jRs!HtTlVht;TJAzYUG+m%XpUL} zCgPqnfO$BOho20i0dV;|?fEd{XvC{R5nH8Vs7Byc?WxinSXvD0VAm{53 zYv@qu92JlTY^a{#$_{Onk8-K!;Tm^bH#^ct#IdGk7OJYyhMhUB%K9;TQe49bajv$o z3Pzn%6eKB&c{95%9n}yI@$zBdiE02Vpm>csQ?95^&iQR}v_;AqA`Uw5KWvVRPsld*jK zerX3C3v-Tg*tOxN9#DW_=BS$z0-z!UD~*T5eQ?tE%(b#|KsIVwY+DyxxRem)>o4$xg1 zs6NW!zXEJ(b)O!1&FA4R4R#;xI9~z3T9uy_i0T2i%ZK8{kLWePF|YKK1M*x4)GiOY zmwFU$rRk#D>}P}i+5>2p2KA4As%|Bqm6kpmjO$0sE{|P(w9|1bAzEqTvqH6gSnSdu z`wDPZXc)}=x4!e_@;0#WAsY%r{!r@AyCex&nYD?zDdUJp-oTWHmxyf4o{ zxC%FFJ%|=5r3*7(>WsY5<;Qp&Ll@|QUyBnvmuIj#_hBL&tSof{=XRaR@OoaG=e6;G z%B9Wmg^$%B9EwV=XZ^!w{qm3aY+`G;tOGs`EJ1DS8fh+$G?#V^X+ylZq(ew6Fh{m_ zF2qrtxhuX7>Y+Rss_MG3BFrzlBi*ot%ZB)#@5ma4+~|TYba=9M{J$K9wStg^3xGh6 zVsl$T%3y>iv&DheZ2D;9Pfkj}XK`(RRoRtP_8#Y(Y%&g%O~y>n7VT=daVU4hFS7MW z?(IKfH7mCs&l-n>+(#G=IUZbiG*9^?U9*oW8mxHUD0}J-qhkzrTaOmuA!pAx`WUdv zU8dFJ3>6P}s2+t@{0K%4 z%mqwLH8^m|VL)9ZO_`n4gIszwSYf0xcn z2tT>q`J4_*AUk7zG>sVgZp=72(5!Z?f?_M&s&LC6Kx7hN+->;YvxL{2>P}QRj?I1H zcqLt%dcr}#pKh0ZomNFEwo2iuYtAf6#|-Qk^`R7EMLp0*gKBb?s@G=eStM?y%4WA2 zM?RV6VVibx3WMr5MpbFBGD0LiU1cO}JBF1i2cuvPOJ z&DV84Vp>MRN0dg;>?n$8t6*k6=zD^7BJK$eQX2a@f26YO7}oW|bHa$d@D~Zx7a~bh zD^rkn4q1R2VU||K?+$TqAXl~U$pWOz*l*5}0}39oQ(jNSnp%u!ys7P)i=Qg0-YEqM zmjNvZH=X%>IvST{Vd0_$UHVVS(HS3g->n`u9q7~o?$mMdBkj6 z^k$uCQV;7Mq}m%I9{~6_#$^V27eHRavr7~d&)enW$Ry@Yg7e=gyrC6HYv>;va{y36 zG_Ok|qM^xI%5tfGa^-1|r_##D(+JE+SX5AB66k=P|+g2pO5a*yk@>%4=>`7o}tM?xZ9-p+b7&OX(!#f?swcJ>mk%|=*=x+<) z>U#ujT#Gx8YtsGaJWe}+yF+q8I_bvu5zk# zEwZ2M$_XJ^V0REXFND}oyVSo|qV`6;JzGO6I!z&vF5Y#jX6=$JDcL19xXL}gbTng_ z9nByNd}G=)QWEvZFjN=u%Ijl4iu}}@uaWqc@(>A7Hxze}Xd8A$e|>FGd{s$I$% zaZ@@d@@DZp6`d(N!&VZtS;b3zg`gxzaDxEa6T47I%4Bn-gJsow;Up?|V5&6D=rlKo zp+E?P^X@j&8SpUHgOASQ>BzaIZ!!6Q+vwf?wyI|}@s8HcV-IM;l+V(e2`ZdVkoVjf z#eh0@k~EIO>CV&W(D6giH)C%+OL_PQJX=mF)hl>P1{y-aGSMn?tyI(7-4YYL`Pd6V zgiRQTvcAXP|BHt=R|#|<=lUD6WB~Cb-1vld+pJihl~V^*#@TpisPPDqs^EZjP3D9K z;kdd+e@b{Pcvh0F%z6x!MLKIk{}*nX6lB+%da2{1^$?RfF?akR5f%&3dXCWFFpb8M z-$9#)=V~zi5a39;|2+2b2*&|5jw|#~n_!u5k}>4HEAsydW+IMYqBwlxf#dn##5|cp z7x@7(8qYmQqeC7-GW{)*VPyO+^6-Osl%(Il#Etn~+t!_7&vLY3w?EWu$LMR0<+z$< z8?NmP#@yipsKWJP=;Xm7_U_Wi|77j{$$SL=<4*QU11h9vMccD9@c(2zivDCh_> zL(L|{Xq7%Zg!LFrr#V@;n5b|W!tfksc!k7JoXK2mE0Ky?4l)p6t1znwiJwp&q)XFU zqF0iCw`#Lj^!R-0=4Y9wM?n6SfRlC2v&lYC3^WHqRp3=j0r`|RS^}hKRnj0!1_~u_ zN&dJW* zDlSK?qIeYv1@@VQjNS6;tq`9QfthP&ySw=nh?gw!T|I%Xa%*8>i4M3IVwk8!m@4sq z5Hsen7dY{W22-Db69FF6whwLR2#O*}!igrU(Nt-@A_VS=EGRebExmL+%f!KF&<*rS zPNP!L)N)Gvj9)PpM{ChypC9o!t@9#;~SEu5bXzzpi;#QGcG{xbIPrzB3JS zSIg$sU~$7q{BS#4VP*SZIqX6N5PQCs{wcI?Eh^kN!{$2TGvvLvktzdVKze{@`>SE; z8igw|VP$u>X0Ze05o(KKj}(;TM94yGu5vxg47Msngi7Cw$cm)^#F}UUpYw)}j47&mI9({HZiHU#OGdIGEF*uCU>s5E|EuiR zm(lse@lU5-7SNsTzEt7O``_@*ib1IUk1DInLwx^mgmJS^*sXaady0FX3*Uk11Xj$F zaqP|0+d1bdLQMscz->quk$xje8z z^&5pBx(`l_LWo8C6M_=Bv11$n+KB^U?SxZCi-rz%Ql zNs`bg@?B*q*vO^72}jBN+iyldBEJ>i%WYgQrjNInDmv+D(r8)-)@fk#n3Ey`JC?JsGT8I4*033%kjg2ZELE$T=q-5vNtGczk0SptSvpf5(mhz30J4_TO6?Qz>zA)Kj6r(N z4ohTt`RW=`J`ho6PJDWa6aN^hKWpkm7vxl8f$PcqP<}m+;8`l(j;!>qwG z8Ti`%W#L@1S^aCffJ+U8ZGV$UR7@jk^O#1vPZ{$p)iPD|SYLdUY zp5zalgo$tjW!~AtzQ8x}MYzCCN@0#ZKT{CQX)2oJo}Pu4GhMzPO&; z59YV-*h$h$<+s$r+5A>^B29lcFk;d zm6F%fd$H+FQYUmHjFo+vrViN%J2TdIQ@1kXM^VXj0LkkBlE1?MlFjJ1<^-=^xZxU^ zhou@Luh~_z>UDV!Z?5&iDe}zhRIhApmG9@Blm~bd6mRZii6&{{C|rJLEKQDj!K&hL zDd!OujN@}y>hn^#y$V4_=UyIbPUrEVIsvO3(r)cAEKxd8-LQ;jAe=5SE-wZ%RXC`-0qg83u`_xXjh!J;m_G;ju3h29 z9W9qNe;;RCX#H$fwuH_zt)-&Z^SgR}E9Ww1#!4vH$q|x$gwIY9Icb|cjNkve6gr7b z-v1Ue44uS^&`FF!Cov421nk4uSv9s?2dQ$QTqINa(|DGo;h;rvR3kr^Q93=6AJSNN z4kc4Q)$mN2M%olPWo2he2TV5WZCN;4)m{ICx~u4Pcf=t$V&d#q>VytOh>JF{8q zLd{m=oOLc9(Qzl8=#f^_lI!pSOr8YH5#oIG3zgRnoHnYnZ0 z|71#t0_R#NhwQ?I5^Pj9NNRwhse1v(dEod3qn;Hp<_FvQaJy@?bzOg5Xc8DBz2IBs zu&oC?ReBT%4Qh4IgE-Iymy2}B?lgt&N82k3bjHrXsDL2KtFK(T)IDdB;K=bu_Tdnund z82A#e*+^wyVReWWci6hS8XA^}yN&0~G4TJyn=WFWC16+aK5bqyijeFxj(p%~b8nm~ z`O9&>M(U!}X`6y(qP?T7+z^5U z*tCX2t2-Fz#kT{aulKrLW1yP{M@H{pc+k`J_L7h048AP4*vG5bFteKR+sCas?=*f} zX8hj^K)vta`xZ-pqyVgMkvSLT<83zVVIXlf#4yEMOik(Ol+rUO#nAPmWg+ukD&*hI z`SfxiPc>f-!rn6T8}KtNmp(>@Hi;!{waFs z@_6LTLUup$qa=wE<^>8rVIzJR1wqP2j_)#zoc0|@amhpG0`g}F34$)LIr>G-Xh~77(DYa+?_|@cgw_CEpNl0KqX50%xFJX&-{qH~j z?_WWfg*hAXP}>Kh{=fhHZ#L!r``;!YTeaB(Ck)x)`+tWa57|9Vmx;tQXQk;%(!lW( zyrcuhIP!Q5bH;uOU65GDGe*H|pw;*P!kb2BAVjk#7}OoGf=#?*=;3@v(zZ341*#MS z{XcP#LdVcqvuXj#nQ;K3&??=cl2q0$CNPeRC>M9;I}Z~Y831R&VeWGfBn)G&O#!#x z{|m&#VkSUHf>q@51v*zkLMHoD-0wYpm`3q7d*vi-4xsRGyTo-^5TV`;lMh7;v!6UE zm+6&Of7r8ox~1xsz-I(zXb+DM{YBy(aqXcOj?fm!wWBE3Tpnn9j+aCs1(=Yrvk4e; z$8$04M7n@Eabe9Z{LF7DL*MxVs!C=WO6PZ5MsF}2c8y*yi};pVA8BDgeQ2c15BMpT zD%uPO0wVhQXTYW)lWw8fkeTjCPK6;+z{(psz2N1wxj%@*bsWx~Gf z#v`Zz2opwLYF^xeLPOq}1M7P?WPf$y3ACI#(=F7C1a02JxV#aLh#y0v>=e4ro<$){ z9yCXU8!#~^)bNhhH-MI?fSfg`e=%Xtfo)7;kcBFF0L_?CE}^FlEardz`Two5XDj&K zw}%6xld&W10Y(#wxPQuH&?EN%e*v8nt-!0KSO6;#8{omu!0KVo7<4UtEml`dOgmX8 z5w-vKpZ_n+08Zl4iUdg0LdWj4Voxn_p;l!pCS;NTpIt zGT@{+2WvIP$bSe}>fVcvRl-VnI3{RE9NlDu>Bz8K1+!4{K%$2v3tA1TX>vP2uT`9F z4p2&6<`Nf=ylKyz1n*~niA>*CZw)I&-bQzebSYx}&9paGpDe$X;8dd`A{8<1z-1LR z{b4YlL^Gf6!h{@%%`tGz8OYfymGOC6+F;!!vXVeKkdDx|Y{Vz%G$jcer^Q{W884ZH zuM&h?glG6_0ppJ0`;XksC{WftL|?ZJ!Uf>!^!@*Yr>P`F6sLwEojgPWhtw0vZovME zjD{kB1F@T?m?iJ>l2!~-hbbRH?QS6gt_gdqTJUL7F+wdYXEDJz8eTY=$-x~^A`4{D z_QJ`e<{Y9nlkLQ>YOAtSPvY}G7I>?4NtnrdL2{f^5;rhy8V4GzPmrm+@bGTQvYZ#i zXyu)G=f2U?e(t)@%%R@-^CArngl2x%n~D*B;{@|>@%qiudxUo^Z3XV4Z zvL(i{Snwdxj}NEh=e7OEM~@zTD^Yn{|2B_S5A`mRZEJpvukz6GI@6h<#*L@{+TGh* z;yr4))kLFv`&*clcApR%9Y zl8+8ORG?%SKc+E^*HHQ~WEF0$rgmP@BH3h|-_uRQ!~HqDC>_E_MLLZ@F73f5{MY z;)iHg{!p5iZFS!#c}+6Q}x!^ zi^mvvFlEqJ{A{s5PUs(g8RLI>efjoQ-rr5PZ00^))sXMp0JE+#=#sT!6}v8(b7)1b z*!M4WY>hx7yIHa@JONHW^3bzaa$qF!=yObJ1JnqFG|uD%DGp@PK*6IWlTUI}fD1G3 zF>zN64@dko0-cAWD2eysg+c}hM;^x&ggTq3G9iKBK@8#%&RCK&M*R2ww-N3vVbs{1 z>$7Kk1{FA`3)DrV9HO`%RJ<5-XY#_=BvT+zaK|m#MS9xWc~Jb|@u3gW@*ICDU0{dt2~ap;*xE+|gO&#$|I}AK+ymqTY6LZbwgV+W0srqUSj+tVztFG&l9xkW z!Af_-!}q_1t~W*RWPE{!oKUsofTI1wVHyG96GohU?)gM7#W^5&&plMg6&u$+&R^pB zM^5Nf3|xX}l6j`hei%`I01Y1iaATMmBI8@AWHE0R3N7%+1L;e#;A;feC3KsnY>P~p zWEAs)09Rmm=sR(nea~HZxy>Hr=CSwxM$-+B2xeizg{6*<;srX|jyP(8=@j9;g_g4= zKp!$tA98RNa5;>^!x)sm8FDH=ZDZsYcIe}1mOS*PY|7n*Pv=VB|CS_Nj)n}=;S6SG zD)~-fZdFk&A-4)x{nF^_JBYDp8o{CFW)I;WjUl?T;M<3&MNPxvECG2Z%WH(BP*f=i z=oX=-f=7YYlq%w%M|0pO@ku$fdGqBHc~i0st$uNCKvV8@ZdHq~zAtlTmAqf7hKu52 zjS-#*Q*9SvgbJ%lqD#`uE?T!pqUK5Bgy>di)yJo}4dmvcoz=Gjswxa!B0ER6Q9Xy2 zlk%J%i}Pyv&&>I*+w3$BGP$_lWSM`H_nwGfs;G8;w}y`KF}xaV!?t@vOE13R_HMUp z^ld{wI5G~5gWUDtJp11p)N5z;+Uwm82v`?%A=q%3v>AU1!=!rc^?Us4FfXuQe~yuE zv|A5oR@tlv3jBBx$KKchiR}?yRCxsJS8QT%JVxW=^~dl34Xh6s9$9n+6^E=mBz6dD zpT|e?#j{1?9ZqCDNzHMs4g6Rr8UY!kK@b61pV8sR1xa^E>tjfL=(0WN0q{BG!?jKQ zFRkCw;C8p=JEtd&+y2cLF_bTtG+RxV-^n;0g&!%QirE1mX?WA31|0{|r==ix;Qnuz z1B~`TN(0rCsO$!ct@?nOmfSTXxdhZ1R3JpT#v(_N zQihE@ydxgov_YyoLS+iCjt2=W1kp7Y+<{fl-7P##3SB0%sF3`c@A^7e0|=(j`a7PU zrmS|`INF=z!L$5ZKdTL&Nn;c|`$*>ES?>a&N7 znC`En?tpT$#m?G-we^8+3xSY}Yo%XTboA~C+wK%J@JuMf@3?!D0PmPO zbL|8&(M^RwMvFA^)5GMltDd~lQ9T1)#DxLvSDuE>c{tRiArFB1GEE6JL)>a`nqTaA}&=GTy@H;)YQ2VSIod6#ZFo?kz~M*()U zpXP|(g??Ig#G`SciyrNwfmHU&i!+NB-fxX&c44Oulz2VDxSc!mkT~)%OqW0veV8mB z=IBqxlTbK4&Xo>KK)q(!xy+0S@P*libqV?;u+#C3wjC!O(;DcOrI|n>Y_qc*Bh?Rh zI*SA^PRr-3D}Id|i7u<$9Nq1uW_9Rj18CxQ^~iUzJ6O5%NGMZuM|Z1|6!U*9yqLR{ z*U0@F^NXuq8!C*7*rl@WT z{6jUY1UW>*N?1wbGG%1tVPrJ;Uw(juX>7uQSQl+A$-D%pPZlDI+3)E7*5F``egTItl<$u6n-8h1T zAR2if6FmazG>MMcGKxW{BnO$?=rr}x*HoD zWB<`9wIuy(Xk%P(WT1lIDp`J2_d(Th!A~J1G^>5pSTC!?^2pl_tG7F65uH*+E;-M! zP@yt5>xx1A;pajxsw%EvZgRLV072@Mai2BVTt8klSo5KGzMb#*$>|5!*0^Rz1+1FBnfNUNNsIm8{yEy}E?VK*qMHTQ2 zw{v-OOtkSLCSQluw;BT1dqbDq2L5`*Aq$#jD?!ujON|U7*;93+Z{Hb3k?$}JyyST5 z<-bjIh3<8hAX~BQWrcZt>e`pN$5|Q`NJW8MzfWL>F_-{%t4#nHfRzQKgi`UW^{A9Z zEO$+aqk(r^b}vnElCoBsXx9+}U)Ta0doXqIG{uP)F3?gU{Cg%a44JF8AG&5><;~AB z@vl7OJbj-;;Zs+=_qZN-LpHQY)Q$7R-9S-lS5^&gJ@+aFn#khITsYXoRtxp#IHXP8 z#~-g&my3y_NF7(;H0V|fsW9mMUV8GwTw^$kCY>pl|w2R`nGO~kP0H> z3tYA$EV2)AYdeq9@}~7e zJat0vR78^7q7_8$L#6j`^Drs*N+4&i1<0YGA59~r;?@(~>ivPX@F8^(`y0<48dSY; z2Ys%Rv@<$A(nhBptiC;mmVP}~Wzcj29p6oGs9A{#NG(9`4oT4{7nLP3z)hB0o=y>& zlY`TI#H-I4ljD3qMLLV(QsAT)^wFBC<-_ zvnN!{TEwJ0!#zC4yxscrr(qTHvFZ8IhEsLd$3?%q^7cLLvq@CFb@YUTGKl%Rj_0K;(7@1xD^{cM_H|u-< z??3m_B>w|MYBs+I#lku=jlb;o{kY-Q_FqN%ACo?z<14 zAC1BvefZpSfBrEVSx>wlyzlqj?_LIfe`Qbizw`Z3@bbjHU(5f!Y)5~#|I_L7cMp$$ zsOQ;#{pz{yyc#Z^yogvJ}lh(-@B8)-Mw43hpYtOeL&wRc!_2jn6u=e`Jw@eW6C6qIhcze=|2naP5%D# z_t4wt$?Ko|L+mj?@c8-uGxOR0L;H!h^j`0Ozx>&JbmD&JzZ-q`@+bcDJztFMr8Pc* z(F8x8j2{i>V?E3BllMdTb3Fu=O-z^#(=!{ zpg$*nf0Yt8JpK7OVC+nXJpS1F+roMLV>-J3{jfC4g$45E^cA|X?g?{NZhe&IEtgL) zC@~qdP-W-|;E}e0qZJt(pH%FqeTEC`yAl%xzwP66}Z?BQ=`c1tPm-^!-+T zBvZ-IE_q5Xozr%})ArGH2^q*7;&hItr$O7$L8Tg^uY9Fe$&w^Vwt@+I@cJm)@f`5YO=IvUJ zyiv$e-rh~pZ7efFu$|uADy6w0q-_k}O5p&6!~eg%Yu$0%NY?-7DH;lVvJ05jWiB31 zJo2$O@qv$HA0v~64`M6=N~9#ND3ME2o*4z%r`bRI4*7NbVyCJvWZy_p8c`lwau8c0 zsqSV~cXd^D)mNwT;q9NlGlqqQ7(fBNHbym!p=ahdaui1(?pxoMv|7cL74sA{w<%_= zH$G8^$Hz~)?y)~O{_GRA&JA9hl7^lO_hzeVy;Ks))38~NODLw8Kr{FAEP&v=80-+~4azwY&>6D#7zQKW6 zjve_Ublk9v;DwzTjP!}}NBn}Fx+1hglt(}mWDR|9aC_NeP0_Fd3ZKu&%* zW3fBpxV~kJAv8HeYZMK#xs7zGPeeKb@>ZSCr~v5sUT%x$=^EdE5s^k@rNJE{g>5D8$Yg|9mYzmq(Oran2xt zDvHFgT#24TB|2QdJXT+c<~N`q0CqM7BwNfJukn zl(M(y97Xcnf+1P8S?Qum06sDROCGNa`yoq_q!Z^um+n`5{fA?sVLj%`)@cFqsAE)4 z6hz5u6<<;y7PS^ zs+yHrkX)IlRfMIr%4yYd1w&Jq1ls>f~3X$xGqI zXu`RKG0DXf#g>(m%rV2fr%MpFNOD{(p3%g9%$6~JEHNpHDNs}~&)Gc@wcJ@_y7ZG9 zUn%kpxLI4zzf}OpE|Tz1--`HLIXL<3qWk#b^voHIdXG=eJQ1{;xBqAyjJ~yF(;^Ji z1)(5crC75^D^%*3JB)1`QAe4WeD&~oEX_z4+$4(!B4# zgV7f|G)m`IxW)yBDT>}+yW2BXI=3xp{j0w*t&Il}s6O_9BJPPTLCYeibTTQ|A~AFa z66O-)YvJn*7KDd+Z+KU0gUe`>{9snotan}*c}p8lp=RgqfP62K;apK&Q}!r)(zZ#l zJ*r17<{saoiuJDTRJNUn9Xy~HA)w$ObHj+NVKwhyXYy~rQ1IS>;!&YVs+*LB(G~;& zn<2}&ZpRMzBIba>eC=JTloIXH75a_x^!=PW)mlx7ZevM>R^%*Vp-@Yz+W`q@tQti8 zV_1LR6gh^c=X>=+;C)o$#qsT#6gki^SbIXo?q3?TC z?hDdjL`z=T&s?^P;Hd2WrOB~n9Wl462?}$jR-Zkk5HB62@U84~P|)76SZd;QP$`gc zSynoy=rk`u7NCfA%Dgi@ixgF~7?hO?q9UCRM_vKnQF+-&v_@+SCOp=I_1y#k#V1G* z_KhwQs~6h9;wyoa$9lKJh+!GRlxCG7KT3PT_Z=qJQ+9V4!Z zlZF8`1lS7B-9Yd7eWiYYQ9vR=6#ATj=`dLgSl}!uDjsi|uz&I%TeziRNgvcTE?d3s z38*$7-EiU4i)rS^m#uEgJQgS!c)Nsf3F1b~gZa3MdWq&svy!hI`Z3O=y2qBeVJ~d) z8Mh`$keLw=R}*OD1r@eR{Ah~Tk?=9R(u2n@9`t&ko5SB1@PC^xY54dM{-?r&-=e=0 z*MY?R@;l`RZXCaLXW}QSW9>n&!(D4|>%rsi3sN7+I-&&LKu0Pr%S&xictZJyHM~Fy zD_bz!0p(?->8&V~3Yy9y6h;f<6rN-O0nKx*B`gr4l|<``!e876)l!jK zREL*Ie0sdatS3Z`qa|Op;~kZ&EiKDlB#ds(?3KrB3bS{2)|S873Wtg;#+9+aG?eA- zOWvLZPF0>8yJa8G;QjizCthqz%p?Cx?iJszCvKL)ZuIprE`PHvaqc~~_dJg875f;d z;8({xI~L|i==Tmc4hZ>CQ%AaB{b4&~_ zi8fJ{njQE)63nH%?RIS3HP8T6f@{85{6{FgIrP((S#c(_f@_946V6myV|L@5OUIxq z+@AOn!qtrLLu<9#{L6K@Cr%KXJ3Ge#m%6UjW}>^u{hRlhi?w{%^ks6`YJ9Q#&rUYv z&G|XH_P!J;xLhQk{uoa0izFOH$+eq!>xsgOKyAefe(z3*ms9F-OW?97_d9ZP_Yi z2A&jfa6$tPPO*Sk_YwRf*DOD>1iu#A<5kOGcO6J^bTBUSNJp=~QI62X!=5oUIIoC+(fGk|`l}Ea(jx-Xov}r^#aC^; zuE3H>+{G}FStLEoOew-!s< z*HJi12a9Cf-ahg3wh=?Gm}{%4FOLf7T#K+3o%Koi%?3%Ey5Vw6oiYv3wrITmq*g{$ zm#+4X`owy@L28bJ5lGf~1El0F!;2u;7_2{cQG=HGcljU@gfHl*I@>Sk^xx%IXW9?MieHHr@U+@(x;EM(9cA;b=!fBeh2QP6&#yt(rfL(6vTC)ibxGfemS1 z(psO`Z|sm4lJe9XBPu$n!3FKtcVU6n6sNy>bvMLM+^G)=76lA_lzrSj;j22RCc6I1 z4GFF{(=?;irFP%N_Z@g4rx;YD>=BuE|+xEE_Vk=&?`o&&{uvzej_n zXegSlNy1_z8&WXuuTR9s2FOdrwP&zj@t1cRs%De;`uwm_`)C9USLhNLKlRo4+?_SPrHwJ%@v z=CAZlbh5GhU5~6qe-d5ui#vC`_{qm$=^Ld0=2PqRf%@Fy=qO+Pbt{UEk1>+!d!)9jnU*s-olG+^BhGbSd?-S*Fli?w3 zA)*lECEpvMY*9~sWo(FLICT|3sh7T7!oE4;`LP=}La-m4ci8eOYKi{qQb;u0C;U_M z>((7a=_HCPssq>-+*XMAJ!{x`K8R|r=K6!(aFxqL>J#bP-K`QUn6L>ML2;mH$fp$P zX|qcxFzroB+~F_+Ih0dh$_0NeZW+k;CQw2#bL==2Dq;Wy0>UmBcO5fE)akScR<^us zVVJgnCJ-0_6f?wA^8VG3VJirPNc5{>3J@KzBl3trA#gKQG99RXtR0VL}26nCm zc(QyB!j@>8B^<%$%ZfFx9YL>cjs7A_^!GY2Qqoyn=i|V)A@~XLZt) zLF$q;&wZfx_7Xx@rLr1SFU|ej2fA-JL$pqgt3m(LgwTDU|JJfcD|22eb42%w%y-Wx zt(NK4S))x^>x!wf@qy>@+1GyH$@n+7BgLQte&JWi4Znwlc zC*)Uv2N*h@HT8ZU6pw9W>8}75FouRM$igBU8iiig%>dKOAt$t2XjmIB9ylLPsqbXwUBH)) zwU$|FGDICIW$I43$xQbMjUv@3;`D??#C4Q6AK zCPUnyfMdb1-RtyDoKDy2J^8tNcF}!&aeC$qM!m-;XI>jsGm#yO-2q%9SdBJ=P!%lDK!xjl6mwgZ2gA#RePrH=% zLtxI4S96@^)UN|q+(oL4@gcl=LVohc$Xk5;ZQ#R|6dFO(A&ssBpZ@4w1Pd~lJ|yqK zp2ob=QTC8rVPerk@_G?wi{v4B5iLOZaubiX^pl&7{eho6%HKS;7UC;hSg^yLC{ADe z>|7^qtj!(0qMr<>bK~^L>=yo#w>Wtk>A*T8Uf2H7Fq&zZ_IXx<22Po9f(GZqbQupW`q~M&ufn&y%-7N);0#j^X{Y~$m)=Un@ljdIsaLt8q)FkaZ4W|q}= zf7qCh@}4vDl^EyeU?uDh8JD8}d0!dH#(3YCV=(=BcOtZ%r0M5p3=CrA``$h0%uiQW z&Py2P+Pu-s#~kN6uU`4=|6(Yx-dEoVLyC z^2qq*fi=Y93g)My<=)_L+S7$U_kyt^eUv&FgNRH&{ejQmk#5OSQJzwHQ3)xly3ma1 zDoTf8B>Our#Y>V?&!+9b-F#oq{R=%g;P(S4nk1~BcKm!xXU8vYP*?! z0J4+!{(w5?(05dA6O0z~sgK7M93H9!>Nl_QK`KFiU;g#&>mSG)+T1un1*X-f*A&cO z`F`P2m7?^vM3evfH$lagyu^1@dX>A=owXHyTsj9ZLNg8*DXf7x0%AESXwcJJpiV~R zx}aEa1Iw6Qge8KjqX_d_Iwu{H>v0^UnDmvd*9Fk)8R05qHHC)-{;KK6bfu z9(RhkvRlVpWRXHTVL^1{XfuAXh&bzZIvt@^>*&aDBF_yuGP3278Qyk0KLpiz1|#kE zIvuGs7W3x?ZW!f8MKP&D-y7V59HVW}xKrgBdYwlP%3WtZu*DO0Swa^3r6|-1yl@tL^f{xc&|r>#rVdobXU2|~O#Di>ZBb~Tc92*SMC|rPSNzj_XSrf(ZckCRqSp|ZF?nqmYM%`U-DShX%xYDO> zMn{FTYN4bvluzf32BtL#E1nXHPG;C92JrkWgAIdToq6mCIrV1&6}Y&`4zRWNK|&_5 zB<2+1KsiPjeflFISAj1J5b40Dkm^9B5eRwg`Lfk2ga@~7<0P2539M2#$WN)&J9tt9 z0|X`3dIyi3Kl=JD>N8NeWdyq7yTfV^VpJ^XqWff9-(BG#h7F+>pl`5 zZkx{d$cyiMyu}4CNL`Sj-i2)MV;p>OV{Ex7o%DOfRo<)!7O+uwY^ChjPSvqSQOPf0 zgdM=Uh@JDp$tkXuYv-hW%Fe}(+aO(jQO@y<yH zSdrpn@1!``!_RkGsB`yYFa}HLExfU9y}I(U)#A71B`Nb%tY1*kq?12IzYvk{VkLt% zEJW>Ve~zbkuoej@we-jiG+m#(`|+<=&a3&v4Kx3lwm_*i0elFie%OAN#lztx2KK=I zck3s=V(Lx~5ZF@2W0_EeV%8`B?Ptr3m_~=Ox^gnV_{DkuR@F148KLkU4_Eye6~^>m zIw1Cu&YTIb*s|`oD5a5XCex2JbrN^8BE%ApIJKFe?%!p~pQ$AdBJWl_Imu?!d-(tG KeHBDh&;kHlHC=H4 literal 0 HcmV?d00001 diff --git a/tests/resources/parser/test_data/de/meta.info b/tests/resources/parser/test_data/de/meta.info index b76dd9c3b..d525edbaa 100644 --- a/tests/resources/parser/test_data/de/meta.info +++ b/tests/resources/parser/test_data/de/meta.info @@ -1,4 +1,8 @@ { + "BSZ_2024_01_29.html.gz": { + "url": "https://www.braunschweiger-zeitung.de/niedersachsen/article241536118/Heide-Park-Eroeffnungstermin-fuer-neue-Attraktion-steht-fest.html", + "crawl_date": "2024-01-29 19:29:19.952428" + }, "BerlinerZeitung_2023_04_28.html.gz": { "url": "https://www.berliner-zeitung.de/news/550-kinder-gezeugt-gericht-stoppt-uebereifrigen-samenspender-in-den-niederlanden-li.343191", "crawl_date": "2023-04-28 20:25:16.328923" From 90cf5416e68159e6352328a3a81423c0d5245d23 Mon Sep 17 00:00:00 2001 From: Adrian Breiding Date: Mon, 29 Jan 2024 19:40:48 +0100 Subject: [PATCH 2/6] Fix of Pattern import issue --- src/fundus/publishers/de/braunschweiger_zeitung.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/fundus/publishers/de/braunschweiger_zeitung.py b/src/fundus/publishers/de/braunschweiger_zeitung.py index 51e3d3929..5f49db8e8 100644 --- a/src/fundus/publishers/de/braunschweiger_zeitung.py +++ b/src/fundus/publishers/de/braunschweiger_zeitung.py @@ -1,8 +1,7 @@ import datetime import re -from typing import List, Optional +from typing import List, Optional, Pattern -from lxml.cssselect import CSSSelector from lxml.etree import XPath from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute @@ -17,7 +16,7 @@ class BSZeitungParser(ParserProxy): class V1(BaseParser): - _author_substitution_pattern: re.Pattern[str] = re.compile(r"FUNKE Mediengruppe") + _author_substitution_pattern: Pattern[str] = re.compile(r"FUNKE Mediengruppe") _paragraph_selector = XPath( "//div[@class='article-body']//p[not(contains(strong, 'Meistgeklickte Nachrichten " "aus der Region') or contains(strong, 'Keine wichtigen News mehr verpassen') or " From b14dcda7a69598c7bb57afe142261246d6eb5ad3 Mon Sep 17 00:00:00 2001 From: Adrian Breiding Date: Mon, 5 Feb 2024 10:44:33 +0100 Subject: [PATCH 3/6] Address changes requested by reviewer --- src/fundus/publishers/de/__init__.py | 8 ++-- .../publishers/de/braunschweiger_zeitung.py | 39 ++++++++++++------- tests/resources/parser/test_data/de/BSZ.json | 4 +- 3 files changed, 31 insertions(+), 20 deletions(-) diff --git a/src/fundus/publishers/de/__init__.py b/src/fundus/publishers/de/__init__.py index 3f6cdd89c..a7f5bb31b 100644 --- a/src/fundus/publishers/de/__init__.py +++ b/src/fundus/publishers/de/__init__.py @@ -6,7 +6,7 @@ from .berliner_zeitung import BerlinerZeitungParser from .bild import BildParser -from .braunschweiger_zeitung import BSZeitungParser +from .braunschweiger_zeitung import BSZParser from .die_welt import DieWeltParser from .die_zeit import DieZeitParser from .dw import DWParser @@ -176,7 +176,7 @@ class DE(PublisherEnum): domain="https://www.taz.de/", sources=[ NewsMap("https://taz.de/sitemap-google-news.xml"), - Sitemap("https://taz.de/sitemap-index.xml", reverse=True), + Sitemap("https://taz.de/sitemap-index.xml"), ], parser=TazParser, ) @@ -200,7 +200,7 @@ class DE(PublisherEnum): domain="https://www.braunschweiger-zeitung.de/", sources=[ RSSFeed("https://www.braunschweiger-zeitung.de/rss"), - Sitemap("https://www.braunschweiger-zeitung.de/sitemaps/news.xml"), + NewsMap("https://www.braunschweiger-zeitung.de/sitemaps/news.xml"), ], - parser=BSZeitungParser, + parser=BSZParser, ) diff --git a/src/fundus/publishers/de/braunschweiger_zeitung.py b/src/fundus/publishers/de/braunschweiger_zeitung.py index 5f49db8e8..b326c4c14 100644 --- a/src/fundus/publishers/de/braunschweiger_zeitung.py +++ b/src/fundus/publishers/de/braunschweiger_zeitung.py @@ -14,26 +14,35 @@ ) -class BSZeitungParser(ParserProxy): +class BSZParser(ParserProxy): class V1(BaseParser): _author_substitution_pattern: Pattern[str] = re.compile(r"FUNKE Mediengruppe") _paragraph_selector = XPath( - "//div[@class='article-body']//p[not(contains(strong, 'Meistgeklickte Nachrichten " - "aus der Region') or contains(strong, 'Keine wichtigen News mehr verpassen') or " - "@rel='author' or em[@class='print'] or contains(a, 'Jetzt Angebot und Vorteile " - "checken') or contains(text(), 'Lesen Sie mehr Geschichten aus') or contains(" - "strong, 'Mehr wichtige Nachrichten aus') or contains(strong, 'Täglich wissen, " - "was in') or contains(strong, 'Auch interessant') or contains(strong, 'Das könnte " - "Sie auch interessieren') or contains(strong, 'Lesen Sie auch') or contains(" - "strong, 'Mehr zu dem Thema') or contains(strong, 'Mehr zum Thema') or contains(" - "strong, 'Lesen Sie dazu') or contains(strong, 'Lesen Sie hier'))]" + "//div[@class='article-body']//p[not(" + "contains(strong, 'Meistgeklickte Nachrichten aus der Region')" + " or contains(strong, 'Keine wichtigen News mehr verpassen')" + " or @rel='author' or em[@class='print']" + " or contains(a, 'Jetzt Angebot und Vorteile checken')" + " or contains(text(), 'Lesen Sie mehr Geschichten aus')" + " or contains(strong, 'Mehr wichtige Nachrichten aus')" + " or contains(strong, 'Täglich wissen, was in')" + " or contains(strong, 'Auch interessant')" + " or contains(strong, 'Auch interessant')" + " or contains(strong, 'Das könnte Sie auch interessieren')" + " or contains(strong, 'Lesen Sie auch')" + " or contains(strong, 'Mehr zu dem Thema')" + " or contains(strong, 'Mehr zum Thema')" + " or contains(strong, 'Lesen Sie dazu')" + " or contains(strong, 'Lesen Sie hier'))]" ) _summary_selector = XPath("//div[@class='article-body']//p[1]") _subheadline_selector = XPath( - "//div[@class='article-body']//h3[not(contains(text(), 'Alle Artikel der " - "Serie') or contains(text(), 'Mehr zum Thema') or contains(text(), " - "'weitere Videos') or contains(text(), 'Auch interessant') or contains(text(), " - "'Weitere News'))]" + "//div[@class='article-body']//h3[not(" + "contains(text(), 'Alle Artikel der Serie')" + " or contains(text(), 'Mehr zum Thema')" + " or contains(text(), 'weitere Videos')" + " or contains(text(), 'Auch interessant')" + " or contains(text(), 'Weitere News'))]" ) @attribute @@ -51,7 +60,7 @@ def title(self) -> Optional[str]: @attribute def topics(self) -> List[str]: - return generic_topic_parsing(self.precomputed.ld.bf_search("keywords")) + return generic_topic_parsing(self.precomputed.meta.get("news_keywords")) @attribute def authors(self) -> List[str]: diff --git a/tests/resources/parser/test_data/de/BSZ.json b/tests/resources/parser/test_data/de/BSZ.json index 677a87185..b4b242ff7 100644 --- a/tests/resources/parser/test_data/de/BSZ.json +++ b/tests/resources/parser/test_data/de/BSZ.json @@ -6,7 +6,9 @@ "publishing_date": "2024-01-29 18:09:51+00:00", "title": "Heide-Park: Eröffnungstermin für neue Attraktion steht fest", "topics": [ - "Heide-Park Soltau Soltau Freizeitpark" + "Freizeitpark", + "Soltau", + "Heide-Park Soltau" ] } } From 447ee9e0d0acf24235129298f8a0bbd299e0a1d5 Mon Sep 17 00:00:00 2001 From: Adrian Breiding Date: Mon, 5 Feb 2024 11:23:38 +0100 Subject: [PATCH 4/6] Simplification of paragraph selector --- .../publishers/de/braunschweiger_zeitung.py | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/src/fundus/publishers/de/braunschweiger_zeitung.py b/src/fundus/publishers/de/braunschweiger_zeitung.py index b326c4c14..a73de83c9 100644 --- a/src/fundus/publishers/de/braunschweiger_zeitung.py +++ b/src/fundus/publishers/de/braunschweiger_zeitung.py @@ -18,22 +18,7 @@ class BSZParser(ParserProxy): class V1(BaseParser): _author_substitution_pattern: Pattern[str] = re.compile(r"FUNKE Mediengruppe") _paragraph_selector = XPath( - "//div[@class='article-body']//p[not(" - "contains(strong, 'Meistgeklickte Nachrichten aus der Region')" - " or contains(strong, 'Keine wichtigen News mehr verpassen')" - " or @rel='author' or em[@class='print']" - " or contains(a, 'Jetzt Angebot und Vorteile checken')" - " or contains(text(), 'Lesen Sie mehr Geschichten aus')" - " or contains(strong, 'Mehr wichtige Nachrichten aus')" - " or contains(strong, 'Täglich wissen, was in')" - " or contains(strong, 'Auch interessant')" - " or contains(strong, 'Auch interessant')" - " or contains(strong, 'Das könnte Sie auch interessieren')" - " or contains(strong, 'Lesen Sie auch')" - " or contains(strong, 'Mehr zu dem Thema')" - " or contains(strong, 'Mehr zum Thema')" - " or contains(strong, 'Lesen Sie dazu')" - " or contains(strong, 'Lesen Sie hier'))]" + "//div[@class='article-body']//p[not(not(text()) or @rel='author' or em[@class='print'] or position()=1)]" ) _summary_selector = XPath("//div[@class='article-body']//p[1]") _subheadline_selector = XPath( From 60be80ae6e06170ad0b1b48950d89cd31a11701d Mon Sep 17 00:00:00 2001 From: Adrian Breiding Date: Tue, 6 Feb 2024 23:23:48 +0100 Subject: [PATCH 5/6] Slight change in Selectors --- src/fundus/publishers/de/braunschweiger_zeitung.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fundus/publishers/de/braunschweiger_zeitung.py b/src/fundus/publishers/de/braunschweiger_zeitung.py index a73de83c9..a80cc6aeb 100644 --- a/src/fundus/publishers/de/braunschweiger_zeitung.py +++ b/src/fundus/publishers/de/braunschweiger_zeitung.py @@ -18,9 +18,9 @@ class BSZParser(ParserProxy): class V1(BaseParser): _author_substitution_pattern: Pattern[str] = re.compile(r"FUNKE Mediengruppe") _paragraph_selector = XPath( - "//div[@class='article-body']//p[not(not(text()) or @rel='author' or em[@class='print'] or position()=1)]" + "//div[@class='article-body']//p[not(not(text()) or @rel='author' or em[@class='print'] or contains(@class, 'font-sans'))]" ) - _summary_selector = XPath("//div[@class='article-body']//p[1]") + _summary_selector = XPath("//div[@class='article-body']//p[contains(@class, 'font-sans')]") _subheadline_selector = XPath( "//div[@class='article-body']//h3[not(" "contains(text(), 'Alle Artikel der Serie')" From 460bbdd74cdb50504a45027053d2f4ffa5548f3f Mon Sep 17 00:00:00 2001 From: Adrian Breiding Date: Tue, 6 Feb 2024 23:37:32 +0100 Subject: [PATCH 6/6] Adding Sitemaps to BSZ Sources --- src/fundus/publishers/de/__init__.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/fundus/publishers/de/__init__.py b/src/fundus/publishers/de/__init__.py index a7f5bb31b..550739505 100644 --- a/src/fundus/publishers/de/__init__.py +++ b/src/fundus/publishers/de/__init__.py @@ -1,5 +1,7 @@ from datetime import datetime +from dateutil.rrule import MONTHLY, rrule + from fundus.publishers.base_objects import PublisherEnum, PublisherSpec from fundus.scraping.filter import regex_filter from fundus.scraping.html import NewsMap, RSSFeed, Sitemap @@ -201,6 +203,12 @@ class DE(PublisherEnum): sources=[ RSSFeed("https://www.braunschweiger-zeitung.de/rss"), NewsMap("https://www.braunschweiger-zeitung.de/sitemaps/news.xml"), + ] + + [ + Sitemap( + f"https://www.braunschweiger-zeitung.de/sitemaps/archive/sitemap-{d.year}-{str(d.month).zfill(2)}-p00.xml.gz" + ) + for d in reversed(list(rrule(MONTHLY, dtstart=datetime(2016, 9, 1), until=datetime.now()))) ], parser=BSZParser, )