From e4514f9055e2052f0175c4312ae55281beefe14a Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 22 Feb 2024 18:08:45 +0100 Subject: [PATCH 1/2] adjust paragraph selector to exclude related content --- src/fundus/publishers/us/cnbc.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/fundus/publishers/us/cnbc.py b/src/fundus/publishers/us/cnbc.py index 9453a6818..166bafb34 100644 --- a/src/fundus/publishers/us/cnbc.py +++ b/src/fundus/publishers/us/cnbc.py @@ -2,6 +2,7 @@ from typing import List, Optional from lxml.cssselect import CSSSelector +from lxml.etree import XPath from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute from fundus.parser.utility import ( @@ -15,7 +16,7 @@ class CNBCParser(ParserProxy): class V1(BaseParser): _subheadline_selector: CSSSelector = CSSSelector("div[data-module = 'ArticleBody'] > h2") - _paragraph_selector: CSSSelector = CSSSelector("div.group > p") + _paragraph_selector: CSSSelector = XPath("//div[@data-module='ArticleBody'] / div[@class='group'] / p[text()]") _key_points_selector: CSSSelector = CSSSelector("div.RenderKeyPoints-list li") @attribute From 58dd67a13ac6a23ec5df44c9d777cf4a58c5e727 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 22 Feb 2024 18:12:11 +0100 Subject: [PATCH 2/2] fix mypy --- src/fundus/publishers/us/cnbc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fundus/publishers/us/cnbc.py b/src/fundus/publishers/us/cnbc.py index 166bafb34..ad5aa59ce 100644 --- a/src/fundus/publishers/us/cnbc.py +++ b/src/fundus/publishers/us/cnbc.py @@ -16,7 +16,7 @@ class CNBCParser(ParserProxy): class V1(BaseParser): _subheadline_selector: CSSSelector = CSSSelector("div[data-module = 'ArticleBody'] > h2") - _paragraph_selector: CSSSelector = XPath("//div[@data-module='ArticleBody'] / div[@class='group'] / p[text()]") + _paragraph_selector: XPath = XPath("//div[@data-module='ArticleBody'] / div[@class='group'] / p[text()]") _key_points_selector: CSSSelector = CSSSelector("div.RenderKeyPoints-list li") @attribute