Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Taipei Times #639

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions docs/supported_publishers.md
Original file line number Diff line number Diff line change
Expand Up @@ -1400,6 +1400,40 @@
</table>


## TW-Publishers

<table class="publishers tw">
<thead>
<tr>
<th>Class&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
<th>Name&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
<th>URL&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
<th>Missing&#160;Attributes</th>
<th>Additional&#160;Attributes&#160;&#160;&#160;&#160;</th>
</tr>
</thead>
<tbody>
<tr>
<td>
<code>TaipeiTimes</code>
</td>
<td>
<div>Taipei Times</div>
</td>
<td>
<a href="https://www.taipeitimes.com/">
<span>www.taipeitimes.com</span>
</a>
</td>
<td>
<code>topics</code>
</td>
<td>&#160;</td>
</tr>
</tbody>
</table>


## UK-Publishers

<table class="publishers uk">
Expand Down
2 changes: 2 additions & 0 deletions src/fundus/publishers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from fundus.publishers.na import NA
from fundus.publishers.no import NO
from fundus.publishers.tr import TR
from fundus.publishers.tw import TW
from fundus.publishers.uk import UK
from fundus.publishers.us import US

Expand Down Expand Up @@ -69,3 +70,4 @@ class PublisherCollection(metaclass=PublisherCollectionMeta):
ca = CA
es = ES
jp = JP
tw = TW
61 changes: 61 additions & 0 deletions src/fundus/publishers/tw/TaipeiTimes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import datetime
import re
from typing import List, Optional

from lxml.etree import XPath

from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
from fundus.parser.utility import (
extract_article_body_with_selector,
generic_author_parsing,
generic_date_parsing,
image_extraction,
)


class TaipeiTimesParser(ParserProxy):
    # Parser proxy for Taipei Times article pages; V1 is the only parser version declared here.
    class V1(BaseParser):
        # Paragraphs that are direct children of the "archives" container, excluding
        # those whose text matches a case-insensitive "by ..." line.
        _paragraph_selector = XPath(
            r"//div[@class='archives']/p[not(re:test(text(), '(?i)^(by.*)\s*$'))]",
            namespaces={"re": "http://exslt.org/regular-expressions"},
        )
        # The <h2> directly below the "archives" container is used as the summary.
        _summary_selector = XPath("//div[@class='archives']/h2")
        # Text nodes of the "name" element inside the "archives" container.
        _author_selector = XPath("//div[@class='archives']//div[@class='name']/text()")

        @attribute
        def body(self) -> Optional[ArticleBody]:
            """Build the article body from the summary and paragraph selectors."""
            document = self.precomputed.doc
            return extract_article_body_with_selector(
                document,
                paragraph_selector=self._paragraph_selector,
                summary_selector=self._summary_selector,
            )

        @attribute
        def authors(self) -> List[str]:
            """Return the authors parsed from the first matched byline text node.

            An empty list is returned when the author selector matches nothing.
            """
            byline_nodes = self._author_selector(self.precomputed.doc)
            if not byline_nodes:
                return []

            # Remove a leading "by", everything from the first "/" onwards, and the
            # "staff reporter" / "(with) staff writer ..." markers before parsing.
            cleaned_byline = re.sub(
                r"(?is)(^by|/.*|staff reporter|(,?\s*with\s*)?staff writer.*)",
                "",
                byline_nodes[0],
            )
            return generic_author_parsing(cleaned_byline, split_on=[r"\s+and\s+"])

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            """Parse the ``datePublished`` value found in the precomputed linked data."""
            raw_date = self.precomputed.ld.bf_search("datePublished")
            return generic_date_parsing(raw_date)

        @attribute
        def title(self) -> Optional[str]:
            """Return the ``headline`` value found in the precomputed linked data."""
            headline = self.precomputed.ld.bf_search("headline")
            return headline

        @attribute
        def images(self) -> List[Image]:
            """Extract images located in ``imgboxa`` containers of the article.

            Caption and image author are looked up relative to each image, inside
            its enclosing ``imgboxa`` element.
            """
            boundary = XPath("//div[@class='archives']")
            image_nodes = XPath("//div[@class='imgboxa']//img")
            caption = XPath("./ancestor::div[@class='imgboxa']//h1")
            credit = XPath("./ancestor::div[@class='imgboxa']//p")
            return image_extraction(
                doc=self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
                upper_boundary_selector=boundary,
                image_selector=image_nodes,
                caption_selector=caption,
                author_selector=credit,
            )
17 changes: 17 additions & 0 deletions src/fundus/publishers/tw/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from fundus.publishers.base_objects import Publisher, PublisherGroup
from fundus.publishers.tw.TaipeiTimes import TaipeiTimesParser
from fundus.scraping.url import NewsMap, Sitemap


class TW(metaclass=PublisherGroup):
    # Publisher group exposed as ``TW``; it currently registers a single publisher.
    TaipeiTimes = Publisher(
        name="Taipei Times",
        domain="https://www.taipeitimes.com/",
        parser=TaipeiTimesParser,
        # URL sources for this publisher: one sitemap index and one news map.
        sources=[
            Sitemap("https://www.taipeitimes.com/sitemapIndex.xml"),
            NewsMap("https://www.taipeitimes.com/sitemap/sitemap.xml"),
        ],
    )
58 changes: 58 additions & 0 deletions tests/resources/parser/test_data/tw/TaipeiTimes.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
{
"V1": {
"authors": [
"AFP, MANILA"
],
"body": {
"summary": [
"CHINESE AGGRESSION: Manila accused Beijing’s ‘maritime militia’ of sideswiping a Philippine civilian patrol boat, which sustained minor damage"
],
"sections": [
{
"headline": [],
"paragraphs": [
"Thousands of US and Filipino troops yesterday launched joint exercises in the northern and western Philippines, after China held huge drills around Taiwan and a Chinese vessel collided with a Philippine patrol boat.",
"The annual Kamandag, or Venom, exercises are focused on defending the north coast of the Philippines’ main island of Luzon, which lies several hundreds kilometers from Taiwan.",
"Beijing considers Taiwan part of its territory and has vowed it would never rule out using force to take it, calling Monday’s drills a “stern warning” to “separatist” forces in Taiwan.",
"The joint US-Philippine exercises, which run until Wednesday next week, also come days after a collision between a Chinese and a Philippine vessel in the South China Sea. It was the latest in a series of confrontations between the two countries in the strategic waterway claimed almost entirely by Beijing.",
"Philippine Marine Corps Commandant Major General Arturo Rojas said at yesterday’s opening ceremony in Manila that Kamandag was long planned and had “nothing to do with whatever is happening in the region.”",
"The drills’ primary focus would be live-fire exercises along Luzon’s north coast, while other activities would be conducted on tiny Philippine islands between Luzon and Taiwan.",
"“It’s a coastal defense doctrine. The doctrine says that a would-be aggressor might be directed towards our territory,” Philippine exercise director Brigadier General Vicente Blanco told reporters.",
"“We are not exercising to join the fight” over Taiwan, he added.",
"US Marines exercise representative Colonel Stuart Glenn said the drills were aimed at helping the US and its allies respond to “any crisis or contingencies.”",
"The western Philippine island of Palawan, facing the South China Sea, is also to host part of the drills. The US and Philippines are fielding more than 1,000 participants each, while smaller numbers of Australian, British, Japanese and South Korean forces are also taking part.",
"An amphibious landing and training on how to defend against chemical and biological warfare are also among the activities planned, a press kit said.",
"As the war games began yesterday, the Philippine government announced that the BRP Datu Cabaylo, a civilian patrol vessel, had sustained minor damage on Friday last week when it was “deliberately sideswiped” by a “Chinese maritime militia” vessel.",
"The collision dented the 30m vessel’s front right section, the Philippine Bureau of Fisheries and Aquatic Resources said in a statement.",
"It took place about 9.3km from Thitu Island (Jhongye Island, 中業島), a Philippine-garrisoned island in the Spratly group (Nansha Islands, 南沙群島).",
"Prior to the collision, the Chinese vessel also “conducted dangerous maneuvers and tried to block the path” of the Philippine boat, which was conducting routine patrol, the bureau said.",
"The crew were unhurt and later sailed the vessel to Thitu.",
"“What they did to us is against international law and violates our sovereign rights in the West Philippine Sea,” fisheries bureau spokesman Nazario Briguera said, using the Philippine term for its claimed sections of the South China Sea.",
"He said the Datu Cabaylo was the third vessel owned by the bureau that was damaged in clashes with Chinese vessels this year."
]
}
]
},
"images": [
{
"versions": [
{
"url": "https://www.taipeitimes.com/images/2024/10/16/P01-241016-317.jpg",
"query_width": null,
"size": null,
"type": "image/jpeg"
}
],
"is_cover": false,
"description": null,
"caption": "Philippine Marine Corps Commandant Major-General Arturo Roja, second left, US Marines exercise representative Colonel Stuart Glenn, left, and Philippine Marine Corps and exercise director Brigadier General Vicente Blanco, second right, take part in the opening ceremony of the annual Kamandag joint military exercises at the Philippine Marines officers’ club at Fort Bonifacio, Metro Manila, yesterday.",
"authors": [
"AFP"
],
"position": 170
}
],
"publishing_date": "2024-10-16 00:00:00+08:00",
"title": "US, Manila launch war games after PRC’s Taiwan drills"
}
}
Binary file not shown.
6 changes: 6 additions & 0 deletions tests/resources/parser/test_data/tw/meta.info
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"TaipeiTimes_2024_10_15.html.gz": {
"url": "https://www.taipeitimes.com/News/front/archives/2024/10/16/2003825372",
"crawl_date": "2024-10-15 22:35:38.596680"
}
}
Loading