-
Notifications
You must be signed in to change notification settings - Fork 83
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #674 from flairNLP/add-taipei-times
Add `Taipei Times`
- Loading branch information
Showing
7 changed files
with
178 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
import datetime | ||
import re | ||
from typing import List, Optional | ||
|
||
from lxml.etree import XPath | ||
|
||
from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute | ||
from fundus.parser.utility import ( | ||
extract_article_body_with_selector, | ||
generic_author_parsing, | ||
generic_date_parsing, | ||
image_extraction, | ||
) | ||
|
||
|
||
class TaipeiTimesParser(ParserProxy):
    """Parser proxy bundling the parser versions for Taipei Times articles."""

    class V1(BaseParser):
        """Parser for the Taipei Times article layout rooted at ``div.archives``."""

        # Direct <p> children of the article container whose first text node does
        # not start (case-insensitively) with "by".
        # NOTE(review): this presumably filters byline paragraphs, but it would also
        # drop a one-line body paragraph that happens to start with "By ..." —
        # confirm against the site's markup.
        _paragraph_selector = XPath(
            r"//div[@class='archives']/p[not(re:test(text(), '(?i)^(by.*)\s*$'))]",
            namespaces={"re": "http://exslt.org/regular-expressions"},
        )
        _summary_selector = XPath("//div[@class='archives']/h2")
        _author_selector = XPath("//div[@class='archives']//div[@class='name']/text()")

        # Selectors handed to image_extraction; compiled once instead of per call.
        _image_upper_boundary_selector = XPath("//div[@class='archives']")
        _image_selector = XPath("//div[@class='imgboxa']//img")
        _image_caption_selector = XPath("./ancestor::div[@class='imgboxa']//h1")
        _image_author_selector = XPath("./ancestor::div[@class='imgboxa']//p")

        # Fragments removed from the raw byline before author parsing:
        #   * a leading "by" — anchored with \b so names such as "Byron" stay intact,
        #     and allowing leading whitespace, since ^ only matches at string start
        #   * everything from the first "/" onwards
        #   * "staff reporter"
        #   * "staff writer" plus the rest of the string, optionally preceded by
        #     ", with"
        _author_noise_pattern = re.compile(
            r"(?is)(^\s*by\b|/.*|staff reporter|(,?\s*with\s*)?staff writer.*)"
        )

        @attribute
        def body(self) -> Optional[ArticleBody]:
            """Return the article body built from the summary and paragraph selectors."""
            return extract_article_body_with_selector(
                self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
                summary_selector=self._summary_selector,
            )

        @attribute
        def authors(self) -> List[str]:
            """Return the authors parsed from the first byline text node.

            Returns an empty list when the byline selector matches nothing.
            Otherwise the noise fragments described on ``_author_noise_pattern``
            are stripped and the remainder is passed to ``generic_author_parsing``,
            splitting on the word "and".
            """
            author_selection = self._author_selector(self.precomputed.doc)
            if not author_selection:
                return []

            # Only the first matched text node is considered.
            cleaned = self._author_noise_pattern.sub("", author_selection[0])
            return generic_author_parsing(cleaned, split_on=[r"\s+and\s+"])

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            """Return the date parsed from the ``datePublished`` entry found in ``precomputed.ld``."""
            return generic_date_parsing(self.precomputed.ld.bf_search("datePublished"))

        @attribute
        def title(self) -> Optional[str]:
            """Return the ``headline`` entry found in ``precomputed.ld``."""
            return self.precomputed.ld.bf_search("headline")

        @attribute
        def images(self) -> List[Image]:
            """Return the images extracted from the document using the ``imgboxa`` selectors."""
            return image_extraction(
                doc=self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
                upper_boundary_selector=self._image_upper_boundary_selector,
                image_selector=self._image_selector,
                caption_selector=self._image_caption_selector,
                author_selector=self._image_author_selector,
            )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
from fundus.publishers.base_objects import Publisher, PublisherGroup | ||
from fundus.publishers.tw.TaipeiTimes import TaipeiTimesParser | ||
from fundus.scraping.url import NewsMap, Sitemap | ||
|
||
|
||
class TW(metaclass=PublisherGroup):
    # Publisher group holding the publishers registered under the "TW" name.

    # Taipei Times: parsed with TaipeiTimesParser and crawled from one Sitemap
    # source and one NewsMap source.
    TaipeiTimes = Publisher(
        name="Taipei Times",
        domain="https://www.taipeitimes.com/",
        parser=TaipeiTimesParser,
        sources=[
            Sitemap("https://www.taipeitimes.com/sitemapIndex.xml"),
            NewsMap("https://www.taipeitimes.com/sitemap/sitemap.xml"),
        ],
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
{ | ||
"V1": { | ||
"authors": [ | ||
"AFP, MANILA" | ||
], | ||
"body": { | ||
"summary": [ | ||
"CHINESE AGGRESSION: Manila accused Beijing’s ‘maritime militia’ of sideswiping a Philippine civilian patrol boat, which sustained minor damage" | ||
], | ||
"sections": [ | ||
{ | ||
"headline": [], | ||
"paragraphs": [ | ||
"Thousands of US and Filipino troops yesterday launched joint exercises in the northern and western Philippines, after China held huge drills around Taiwan and a Chinese vessel collided with a Philippine patrol boat.", | ||
"The annual Kamandag, or Venom, exercises are focused on defending the north coast of the Philippines’ main island of Luzon, which lies several hundreds kilometers from Taiwan.", | ||
"Beijing considers Taiwan part of its territory and has vowed it would never rule out using force to take it, calling Monday’s drills a “stern warning” to “separatist” forces in Taiwan.", | ||
"The joint US-Philippine exercises, which run until Wednesday next week, also come days after a collision between a Chinese and a Philippine vessel in the South China Sea. It was the latest in a series of confrontations between the two countries in the strategic waterway claimed almost entirely by Beijing.", | ||
"Philippine Marine Corps Commandant Major General Arturo Rojas said at yesterday’s opening ceremony in Manila that Kamandag was long planned and had “nothing to do with whatever is happening in the region.”", | ||
"The drills’ primary focus would be live-fire exercises along Luzon’s north coast, while other activities would be conducted on tiny Philippine islands between Luzon and Taiwan.", | ||
"“It’s a coastal defense doctrine. The doctrine says that a would-be aggressor might be directed towards our territory,” Philippine exercise director Brigadier General Vicente Blanco told reporters.", | ||
"“We are not exercising to join the fight” over Taiwan, he added.", | ||
"US Marines exercise representative Colonel Stuart Glenn said the drills were aimed at helping the US and its allies respond to “any crisis or contingencies.”", | ||
"The western Philippine island of Palawan, facing the South China Sea, is also to host part of the drills. The US and Philippines are fielding more than 1,000 participants each, while smaller numbers of Australian, British, Japanese and South Korean forces are also taking part.", | ||
"An amphibious landing and training on how to defend against chemical and biological warfare are also among the activities planned, a press kit said.", | ||
"As the war games began yesterday, the Philippine government announced that the BRP Datu Cabaylo, a civilian patrol vessel, had sustained minor damage on Friday last week when it was “deliberately sideswiped” by a “Chinese maritime militia” vessel.", | ||
"The collision dented the 30m vessel’s front right section, the Philippine Bureau of Fisheries and Aquatic Resources said in a statement.", | ||
"It took place about 9.3km from Thitu Island (Jhongye Island, 中業島), a Philippine-garrisoned island in the Spratly group (Nansha Islands, 南沙群島).", | ||
"Prior to the collision, the Chinese vessel also “conducted dangerous maneuvers and tried to block the path” of the Philippine boat, which was conducting routine patrol, the bureau said.", | ||
"The crew were unhurt and later sailed the vessel to Thitu.", | ||
"“What they did to us is against international law and violates our sovereign rights in the West Philippine Sea,” fisheries bureau spokesman Nazario Briguera said, using the Philippine term for its claimed sections of the South China Sea.", | ||
"He said the Datu Cabaylo was the third vessel owned by the bureau that was damaged in clashes with Chinese vessels this year." | ||
] | ||
} | ||
] | ||
}, | ||
"images": [ | ||
{ | ||
"versions": [ | ||
{ | ||
"url": "https://www.taipeitimes.com/images/2024/10/16/P01-241016-317.jpg", | ||
"query_width": null, | ||
"size": null, | ||
"type": "image/jpeg" | ||
} | ||
], | ||
"is_cover": false, | ||
"description": null, | ||
"caption": "Philippine Marine Corps Commandant Major-General Arturo Roja, second left, US Marines exercise representative Colonel Stuart Glenn, left, and Philippine Marine Corps and exercise director Brigadier General Vicente Blanco, second right, take part in the opening ceremony of the annual Kamandag joint military exercises at the Philippine Marines officers’ club at Fort Bonifacio, Metro Manila, yesterday.", | ||
"authors": [ | ||
"AFP" | ||
], | ||
"position": 170 | ||
} | ||
], | ||
"publishing_date": "2024-10-16 00:00:00+08:00", | ||
"title": "US, Manila launch war games after PRC’s Taiwan drills" | ||
} | ||
} |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
{ | ||
"TaipeiTimes_2024_10_15.html.gz": { | ||
"url": "https://www.taipeitimes.com/News/front/archives/2024/10/16/2003825372", | ||
"crawl_date": "2024-10-15 22:35:38.596680" | ||
} | ||
} |