Skip to content

Commit

Permalink
chg: use puremagic instead of python-magic, removes dependency on libmagic
Browse files Browse the repository at this point in the history
  • Loading branch information
Rafiot committed Feb 23, 2024
1 parent 18daba1 commit d910e39
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 31 deletions.
38 changes: 26 additions & 12 deletions playwrightcapture/capture.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,17 @@
from zipfile import ZipFile

import dateparser
import magic
import requests
import urllib3


from bs4 import BeautifulSoup
from charset_normalizer import from_bytes
from playwright._impl._errors import TargetClosedError
from playwright.async_api import async_playwright, Frame, Error, Page, Download, Request
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
from playwright_stealth import stealth_async # type: ignore[import-untyped]
from puremagic import PureError, from_string # type: ignore[import-untyped]
from w3lib.html import strip_html5_whitespace
from w3lib.url import canonicalize_url, safe_url_string

Expand Down Expand Up @@ -132,8 +133,6 @@ def __init__(self, browser: BROWSER | None=None, device_name: str | None=None,
self._locale: str = ''
self._color_scheme: Literal['dark', 'light', 'no-preference', 'null'] | None = None

self.magic = magic.Magic(mime=True)

async def __aenter__(self) -> Capture:
'''Launch the browser'''
self._temp_harfile = NamedTemporaryFile(delete=False)
Expand Down Expand Up @@ -479,15 +478,25 @@ async def handle_download(download: Download) -> None:
self.wait_for_download -= 1

async def store_request(request: Request) -> None:
    '''Store the response body of a request when it is an image.

    Called on each request. When the response is OK, has a non-empty body,
    and that body is identified as an image, the body is stored in
    self._requests indexed by the request URL.

    Never raises: any failure is logged and the request is skipped.
    '''
    try:
        self.logger.debug(f'Storing request: {request.url}')
        response = await request.response()
        if not response or not response.ok:
            return

        try:
            body = await response.body()
        except Exception as e:
            self.logger.debug(f'Unable to get body for {request.url}: {e}')
            return

        if not body:
            # Nothing to identify. Returning here avoids reading an
            # unassigned mimetype further down, which used to surface as a
            # misleading "Unable to store request" warning.
            return

        try:
            mimetype = from_string(body, mime=True)
        except PureError:
            # unable to identify the mimetype
            self.logger.debug(f'Unable to identify the mimetype for {request.url}')
            return

        if mimetype.startswith('image'):
            self._requests[request.url] = body
    except Exception as e:
        # Best-effort hook: a failure here must not break the capture.
        self.logger.warning(f'Unable to store request: {e}')

Expand Down Expand Up @@ -1063,11 +1072,16 @@ def get_favicons(self, rendered_url: str, rendered_content: str) -> set[bytes]:
favicon_response.raise_for_status()
favicon = favicon_response.content
if favicon:
mimetype = self.magic.from_buffer(favicon)
if mimetype.startswith('image'):
to_return.add(favicon)
try:
mimetype = from_string(favicon, mime=True)
except PureError:
# unable to identify the mimetype
self.logger.debug(f'Unable to identify the mimetype for favicon from {u}')
else:
self.logger.warning(f'Unexpected mimetype for favicon from {u}: {mimetype}')
if mimetype.startswith('image'):
to_return.add(favicon)
else:
self.logger.warning(f'Unexpected mimetype for favicon from {u}: {mimetype}')
self.logger.debug(f'Done with favicon from {u}.')
except requests.HTTPError as e:
self.logger.debug(f'Unable to fetch favicon from {u}: {e}')
Expand Down
34 changes: 17 additions & 17 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ SpeechRecognition = {version = "^3.10.1", optional = true}
pytz = {"version" = "^2024.1", python = "<3.9"}
tzdata = "^2024.1"
playwright-stealth = "^1.0.6"
setuptools = "^69.1.0"
python-magic = "^0.4.27"
setuptools = "^69.1.1"
puremagic = "^1.20"

[tool.poetry.extras]
recaptcha = ["requests", "pydub", "SpeechRecognition"]
Expand Down

0 comments on commit d910e39

Please sign in to comment.