Skip to content

Commit

Permalink
Merge pull request #341 from flairNLP/add-business-insider
Browse files Browse the repository at this point in the history
Add Business Insider
  • Loading branch information
MaxDall authored Feb 6, 2024
2 parents c552f60 + b562604 commit c18f234
Show file tree
Hide file tree
Showing 6 changed files with 105 additions and 0 deletions.
15 changes: 15 additions & 0 deletions docs/supported_publishers.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,21 @@
<td>&#160;</td>
<td>&#160;</td>
</tr>
<tr>
<td>
<code>BusinessInsider</code>
</td>
<td>
<div>Business Insider</div>
</td>
<td>
<a href="https://www.businessinsider.de/">
<span>www.businessinsider.de</span>
</a>
</td>
<td>&#160;</td>
<td>&#160;</td>
</tr>
<tr>
<td>
<code>DW</code>
Expand Down
11 changes: 11 additions & 0 deletions src/fundus/publishers/de/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from .berliner_zeitung import BerlinerZeitungParser
from .bild import BildParser
from .business_insider import BusinessInsiderParser
from .die_welt import DieWeltParser
from .die_zeit import DieZeitParser
from .dw import DWParser
Expand Down Expand Up @@ -190,3 +191,13 @@ class DE(PublisherEnum):
sources=[NewsMap("https://www.waz.de/sitemaps/news.xml")],
parser=WAZParser,
)

# Publisher entry for Business Insider under the businessinsider.de domain.
# Two sources are listed: the site's news sitemap and its sitemap index.
# Articles from this publisher are handled by BusinessInsiderParser.
BusinessInsider = PublisherSpec(
    name="Business Insider",
    domain="https://www.businessinsider.de/",
    sources=[
        NewsMap("https://www.businessinsider.de/news-sitemap.xml"),
        Sitemap("https://www.businessinsider.de/sitemap_index.xml"),
    ],
    parser=BusinessInsiderParser,
)
60 changes: 60 additions & 0 deletions src/fundus/publishers/de/business_insider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import datetime
from typing import List, Optional

from lxml.cssselect import CSSSelector
from lxml.etree import XPath

from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute
from fundus.parser.utility import (
extract_article_body_with_selector,
generic_author_parsing,
generic_date_parsing,
generic_topic_parsing,
)


class BusinessInsiderParser(ParserProxy):
    """Parser for Business Insider articles; currently holds one version, ``V1``."""

    class V1(BaseParser):
        """Extracts body, authors, publishing date, title and topics of an article."""

        # <p> elements that are direct children of a ``div.bi-bulletpoints``
        # somewhere inside <article>; used as the article summary.
        _summary_selector = CSSSelector("article div.bi-bulletpoints > p")

        # Every <h2> inside <article> counts as a subheadline.
        _subheadline_selector = CSSSelector("article h2")

        # <p> elements below a div whose class contains 'article-body', skipping
        # paragraphs that
        #   * have an ancestor with class exactly 'bi-bulletpoints'
        #     (those are picked up by the summary selector instead),
        #   * have a <mark> child with class
        #     'has-inline-color has-cyan-bluish-gray-color', or
        #   * themselves carry the class 'has-text-align-right'.
        # The expression text is kept flush-left so the literal stays unchanged;
        # XPath ignores whitespace between tokens.
        _paragraph_selector = XPath(
            """
//article
//div[contains(@class, 'article-body')]
//p[
not(
ancestor::*[@class='bi-bulletpoints'] or
mark[@class='has-inline-color has-cyan-bluish-gray-color'] or
@class='has-text-align-right'
)
]
"""
        )

        @attribute
        def body(self) -> ArticleBody:
            """Build the article body from the three selectors defined on this class."""
            document = self.precomputed.doc
            return extract_article_body_with_selector(
                document,
                paragraph_selector=self._paragraph_selector,
                subheadline_selector=self._subheadline_selector,
                summary_selector=self._summary_selector,
            )

        @attribute
        def authors(self) -> List[str]:
            """Return the authors parsed from the ``author`` entry found in ``precomputed.ld``."""
            raw_authors = self.precomputed.ld.bf_search("author")
            return generic_author_parsing(raw_authors)

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            """Return the date parsed from the ``datePublished`` entry found in ``precomputed.ld``."""
            raw_date = self.precomputed.ld.bf_search("datePublished")
            return generic_date_parsing(raw_date)

        @attribute
        def title(self) -> Optional[str]:
            """Return the ``headline`` entry found in ``precomputed.ld``, unmodified."""
            headline = self.precomputed.ld.bf_search("headline")
            return headline

        @attribute
        def topics(self) -> List[str]:
            """Return topics from the ``keywords`` meta entry, else from ``precomputed.ld``.

            The ``precomputed.ld`` lookup only happens when parsing the meta
            entry produced a falsy result.
            """
            meta_topics = generic_topic_parsing(self.precomputed.meta.get("keywords"))
            if meta_topics:
                return meta_topics
            return generic_topic_parsing(self.precomputed.ld.bf_search("keywords"))
15 changes: 15 additions & 0 deletions tests/resources/parser/test_data/de/BusinessInsider.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"V1": {
"authors": [
"Matthew Loh"
],
"publishing_date": "2024-01-29 19:00:45+00:00",
"title": "Wie groß ist Evergrande, Chinas strauchelnder Immobilienriese?",
"topics": [
"China",
"Immobilien",
"Pleite",
"Schulden"
]
}
}
Binary file not shown.
4 changes: 4 additions & 0 deletions tests/resources/parser/test_data/de/meta.info
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@
"url": "https://www.bild.de/sport/fussball/fortuna-duesseldorf/bubi-bomber-wieder-da-thioune-fordert-geduld-mit-niemiec-83936220.bild.html",
"crawl_date": "2023-05-15 13:55:04.823203"
},
"BusinessInsider_2024_01_29.html.gz": {
"url": "https://www.businessinsider.de/wirtschaft/international-business/wie-gross-ist-evergrande-chinas-sterbender-immobilienriese/",
"crawl_date": "2024-01-29 22:53:02.986279"
},
"DW_2023_04_28.html.gz": {
"url": "https://www.dw.com/de/ukrainische-gegenoffensive-ziele-chancen-risiken/a-65464327?maca=de-rss-de-all-1119-xml-mrss",
"crawl_date": "2023-04-28 20:25:18.143350"
Expand Down

0 comments on commit c18f234

Please sign in to comment.