From d910e391f555a86a3026c9b6b43ba8daf64c6437 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?= Date: Fri, 23 Feb 2024 16:28:05 +0100 Subject: [PATCH] chg: use puremagic instead of python-magic, removes dependency on libmagic --- playwrightcapture/capture.py | 38 ++++++++++++++++++++++++------------ poetry.lock | 34 ++++++++++++++++---------------- pyproject.toml | 4 ++-- 3 files changed, 45 insertions(+), 31 deletions(-) diff --git a/playwrightcapture/capture.py b/playwrightcapture/capture.py index 9fe15e6..f67bda7 100644 --- a/playwrightcapture/capture.py +++ b/playwrightcapture/capture.py @@ -20,16 +20,17 @@ from zipfile import ZipFile import dateparser -import magic import requests import urllib3 + from bs4 import BeautifulSoup from charset_normalizer import from_bytes from playwright._impl._errors import TargetClosedError from playwright.async_api import async_playwright, Frame, Error, Page, Download, Request from playwright.async_api import TimeoutError as PlaywrightTimeoutError from playwright_stealth import stealth_async # type: ignore[import-untyped] +from puremagic import PureError, from_string # type: ignore[import-untyped] from w3lib.html import strip_html5_whitespace from w3lib.url import canonicalize_url, safe_url_string @@ -132,8 +133,6 @@ def __init__(self, browser: BROWSER | None=None, device_name: str | None=None, self._locale: str = '' self._color_scheme: Literal['dark', 'light', 'no-preference', 'null'] | None = None - self.magic = magic.Magic(mime=True) - async def __aenter__(self) -> Capture: '''Launch the browser''' self._temp_harfile = NamedTemporaryFile(delete=False) @@ -479,15 +478,25 @@ async def handle_download(download: Download) -> None: self.wait_for_download -= 1 async def store_request(request: Request) -> None: - # This method is called on each request, to store the URL in a dict indexed by URL to get it back from the favicon fetcher + # This method is called on each request, to store the body (if it is an image) in a dict indexed by URL try: self.logger.debug(f'Storing request: {request.url}') if response := await request.response(): if response.ok: - body = await response.body() - mimetype = self.magic.from_buffer(body) - if mimetype.startswith('image'): - self._requests[request.url] = body + try: + body = await response.body() + except Exception as e: + self.logger.debug(f'Unable to get body for {request.url}: {e}') + else: + try: + if body: + mimetype = from_string(body, mime=True) + except PureError: + # unable to identify the mimetype + self.logger.debug(f'Unable to identify the mimetype for {request.url}') + else: + if mimetype.startswith('image'): + self._requests[request.url] = body except Exception as e: self.logger.warning(f'Unable to store request: {e}') @@ -1063,11 +1072,16 @@ def get_favicons(self, rendered_url: str, rendered_content: str) -> set[bytes]: favicon_response.raise_for_status() favicon = favicon_response.content if favicon: - mimetype = self.magic.from_buffer(favicon) - if mimetype.startswith('image'): - to_return.add(favicon) + try: + mimetype = from_string(favicon, mime=True) + except PureError: + # unable to identify the mimetype + self.logger.debug(f'Unable to identify the mimetype for favicon from {u}') else: - self.logger.warning(f'Unexpected mimetype for favicon from {u}: {mimetype}') + if mimetype.startswith('image'): + to_return.add(favicon) + else: + self.logger.warning(f'Unexpected mimetype for favicon from {u}: {mimetype}') self.logger.debug(f'Done with favicon from {u}.') except requests.HTTPError as e: self.logger.debug(f'Unable to fetch favicon from {u}: {e}') diff --git a/poetry.lock b/poetry.lock index 35ba903..09712f5 100644 --- a/poetry.lock +++ b/poetry.lock @@ -516,6 +516,17 @@ files = [ dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "puremagic" +version = "1.20" +description = "Pure python implementation of magic file detection" +optional = false +python-versions = "*" +files = [ + {file = "puremagic-1.20-py3-none-any.whl", hash = "sha256:14817470dc1e3339356088b58576820efd12544a676c20d7d5e738ea1e06f852"}, + {file = "puremagic-1.20.tar.gz", hash = "sha256:c55c57369bd957bfe3af4765a66784eaaae77d697a6f12477174280e0abcbd07"}, +] + [[package]] name = "pydub" version = "0.25.1" @@ -592,17 +603,6 @@ files = [ [package.dependencies] six = ">=1.5" -[[package]] -name = "python-magic" -version = "0.4.27" -description = "File type identification using libmagic" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" -files = [ - {file = "python-magic-0.4.27.tar.gz", hash = "sha256:c1ba14b08e4a5f5c31a302b7721239695b2f0f058d125bd5ce1ee36b9d9d3c3b"}, - {file = "python_magic-0.4.27-py2.py3-none-any.whl", hash = "sha256:c212960ad306f700aa0d01e5d7a325d20548ff97eb9920dcd29513174f0294d3"}, -] - [[package]] name = "pytz" version = "2024.1" @@ -740,19 +740,19 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] [[package]] name = "setuptools" -version = "69.1.0" +version = "69.1.1" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false python-versions = ">=3.8" files = [ - {file = "setuptools-69.1.0-py3-none-any.whl", hash = "sha256:c054629b81b946d63a9c6e732bc8b2513a7c3ea645f11d0139a2191d735c60c6"}, - {file = "setuptools-69.1.0.tar.gz", hash = "sha256:850894c4195f09c4ed30dba56213bf7c3f21d86ed6bdaafb5df5972593bfc401"}, + {file = "setuptools-69.1.1-py3-none-any.whl", hash = "sha256:02fa291a0471b3a18b2b2481ed902af520c69e8ae0919c13da936542754b4c56"}, + {file = "setuptools-69.1.1.tar.gz", hash = "sha256:5c0806c7d9af348e6dd3777b4f4dbb42c7ad85b190104837488eab9a7c945cf8"}, ] [package.extras] docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"] -testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] -testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.1)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] +testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.2)", "pip (>=19.1)", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] +testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.2)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] [[package]] name = "six" @@ -940,4 +940,4 @@ recaptcha = ["SpeechRecognition", "pydub", "requests"] [metadata] lock-version = "2.0" python-versions = "^3.8" -content-hash = "104039b2860cc843b4244e58c755616cb50f7a465e0afedc96970b15f85cb156" +content-hash = "07c0257c4930a607c8ad88093bdcc95cd28edbb3f00fdf9b639f5b3cf9dcb910" diff --git a/pyproject.toml b/pyproject.toml index 3c398e7..5139afe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,8 +29,8 @@ SpeechRecognition = {version = "^3.10.1", optional = true} pytz = {"version" = "^2024.1", python = "<3.9"} tzdata = "^2024.1" playwright-stealth = "^1.0.6" -setuptools = "^69.1.0" -python-magic = "^0.4.27" +setuptools = "^69.1.1" +puremagic = "^1.20" [tool.poetry.extras] recaptcha = ["requests", "pydub", "SpeechRecognition"]