From 629c7eaccfcc2f9ac25ff29643059ddb85865bb7 Mon Sep 17 00:00:00 2001 From: Iago Veloso Date: Thu, 27 May 2021 23:01:08 +0100 Subject: [PATCH] Improving use of db_catalogue --- README.md | 17 +++---- catalogue/__init__.py | 2 +- catalogue/main.py | 39 +++++++------- catalogue/metadata.py | 45 +++++++++-------- catalogue/model.py | 70 +++++++++++++++++--------- catalogue/tests/fixtures/filesystem.py | 2 +- catalogue/tests/test_utils.py | 5 +- 7 files changed, 104 insertions(+), 76 deletions(-) diff --git a/README.md b/README.md index 11d132e..21f6b48 100644 --- a/README.md +++ b/README.md @@ -12,10 +12,9 @@ Move or copy your files according to your preferred format, using any combinatio ```bash $ catalogue --help -usage: catalogue [-h] [--version] [--verbose] - [--operation {move,copy,dry-run}] [--src SRC_PATH] - [--dst DST_PATH] [--unknown-folder UNKNOWN_FOLDER] - [--format PATH_FORMAT] +usage: catalogue [-h] [--version] [--verbose] [--operation {move,copy,dry-run}] + [--src SRC_PATH] [--dst DST_PATH] [--format PATH_FORMAT] + [--unknown-folder UNKNOWN_FOLDER] Organize your photos folder,. Example usage: @@ -29,11 +28,11 @@ optional arguments: Specify how to move files (copy, move or dry-run) --src SRC_PATH Path to the source directory. --dst DST_PATH Path to the destination directory. + --format PATH_FORMAT Customize how to structure the files in your catalogue. By default : '%Y/%m/%d/{filename} + All python strftime format codes are supported as well as {filename}, {basename}, {filename_extension}, {media_type} --unknown-folder UNKNOWN_FOLDER If provided will be used for media without creation date - It accepts same options as the format flag, strftime format will refer to current time - --format PATH_FORMAT Customize how to structure the files in your catalogue. e.g: '%Y/%m/%d/{filename} - All python strftime format codes are supported as well as {filename}, {basename}, {filename_extension}, {media_type} + It accepts same options as the format argument, strftime format codes will refer to current time instead of creation date ``` ## Requirements @@ -49,14 +48,14 @@ The easier way to run this project is using docker: Run example; notice `source` and `my_catalogue` need to be replace with your destinations: - docker run --rm -v $(pwd)/source:/input:ro -v $(pwd)/my_catalogue:/output iago1460/catalogue:1.2.5 --src /input --dst /output --operation copy + docker run --rm -v $(pwd)/source:/input:ro -v $(pwd)/my_catalogue:/output iago1460/catalogue:1.2.6 --src /input --dst /output --operation copy ### In a virtual environment virtualenv venv source venv/bin/activate - pip3 install https://github.com/iago1460/photo-cataloguer/archive/1.2.5.zip + pip3 install https://github.com/iago1460/photo-cataloguer/archive/1.2.6.zip catalogue --help diff --git a/catalogue/__init__.py b/catalogue/__init__.py index 5f8e8a7..b56df3a 100644 --- a/catalogue/__init__.py +++ b/catalogue/__init__.py @@ -3,4 +3,4 @@ if sys.version_info < (3, 9): raise RuntimeError("Python 3.9 or later is required") -__version__ = "1.0" +__version__ = "1.2.6" diff --git a/catalogue/main.py b/catalogue/main.py index 8409281..7ad7639 100644 --- a/catalogue/main.py +++ b/catalogue/main.py @@ -74,24 +74,24 @@ def main(): required=False, default=None, ) - parser.add_argument( - "--unknown-folder", - help="If provided will be used for media without creation date\n" - "It accepts same options as the format flag, strftime format will refer to current time", - dest="unknown_folder", - type=str, - required=False, - default=None, - ) parser.add_argument( "--format", - help="Customize how to structure the files in your catalogue. e.g: '%%Y/%%m/%%d/{filename}\n" + help="Customize how to structure the files in your catalogue. By default : '%%Y/%%m/%%d/{filename}\n" "All python strftime format codes are supported as well as {filename}, {basename}, {filename_extension}, {media_type}", dest="path_format", type=str, required=False, default="%Y/%m/%d/{filename}", ) + parser.add_argument( + "--unknown-folder", + help="If provided will be used for media without creation date\n" + "It accepts same options as the format argument, strftime format codes will refer to current time instead of creation date", + dest="unknown_folder", + type=str, + required=False, + default=None, + ) args = parser.parse_args() start_dt = datetime.now() @@ -120,11 +120,11 @@ def main(): duplicated_files_list = src_catalogue.detect_duplicates_with(dst_catalogue) if duplicated_files_list: - logging.info(f"Ignoring some duplicates files which are already present") + logging.info(f"Ignoring some duplicate files which are already present") if args.verbose: for files_list in duplicated_files_list: logging.info( - " * {files}".format( + " - {files}".format( files=", ".join(sorted(map(escape, files_list))) ) ) @@ -140,7 +140,7 @@ def main(): ) for files_list in duplicated_list_of_files_to_import: logging.info( - " * {files}".format(files=", ".join(sorted(map(escape, files_list)))) + " - {files}".format(files=", ".join(sorted(map(escape, files_list)))) ) # Remove each first file from the list so it gets imported duplicated_list_of_files_to_import = set( @@ -168,10 +168,7 @@ def main(): logging.debug(f"Skipping '{media_type}' file {file.path}") continue - if ( - file in duplicated_files - or file in duplicated_list_of_files_to_import - ): + if file in duplicated_files or file in duplicated_list_of_files_to_import: logging.debug(f"Skipping duplicated file {file.path}") continue @@ -208,7 +205,9 @@ def main(): ) imported_files.append(processed_file) - if args.operation != Operation.DRY_RUN: # shouldn't save if dry run (data is messed up too) + if ( + args.operation != Operation.DRY_RUN + ): # shouldn't save if dry run (data is messed up too) logging.info("Saving catalogue...") dst_catalogue.save_db() @@ -235,10 +234,12 @@ def generate_filename(file, path_format, dt, parent_folder, media_type): def process_file(file, operation, dst_catalogue, dst_file_path): path_available = dst_catalogue.is_path_available(dst_file_path) if not path_available: + logging.debug(f"Path {dst_file_path} not available, renaming file") dst_file_path = dst_catalogue.find_new_path(dst_file_path) if operation == Operation.DRY_RUN: - logging.info(f"dry-run: {file.path} -> {dst_file_path}") + collision_indicator = ' *' if not path_available else '' + logging.info(f"dry-run: {file.path} -> {dst_file_path}{collision_indicator}") file.path = dst_file_path dst_catalogue.add_file(file) # needed so path_available is more accurate return None diff --git a/catalogue/metadata.py b/catalogue/metadata.py index 54fc353..65b8ec8 100644 --- a/catalogue/metadata.py +++ b/catalogue/metadata.py @@ -29,32 +29,35 @@ def _normalize_datetime_format(exif_dt_field): except ValueError: return exif_dt except TypeError: - logging.info(f'Cannot parse "{exif_dt}", {exif_dt_field}') + logging.debug(f'Cannot parse {exif_dt=}') return None def _extract_created_date_from_exif(exif): - created_data = exif.get("DateTimeOriginal") or exif.get("DateTime") - - created_data = _normalize_datetime_format(created_data) - - if created_data: - try: - return dateutil.parser.parse(created_data) - except ValueError as e: - if e.args[0] == "Unknown string format: %s": - unknown_date = e.args[1] - logging.debug(f"Attempting to parse unknown date {unknown_date}") - match = re.search( - r"(?P\d{4})/(?P\d{2})/(?P\d{2})", unknown_date + exif_dt_field = exif.get("DateTimeOriginal") or exif.get("DateTime") + + if not exif_dt_field: + return None + + created_data = _normalize_datetime_format(exif_dt_field) + if not created_data: + return None + + try: + return dateutil.parser.parse(created_data) + except ValueError as e: + if e.args[0] == "Unknown string format: %s": + unknown_date = e.args[1] + logging.debug(f"Attempting to parse unknown date {unknown_date}") + match = re.search( + r"(?P\d{4})/(?P\d{2})/(?P\d{2})", unknown_date + ) + if match: + year, month, day = match.groups() + return datetime.datetime( + year=int(year), month=int(month), day=int(day) ) - if match: - year, month, day = match.groups() - return datetime.datetime( - year=int(year), month=int(month), day=int(day) - ) - raise e - return None + raise e IMAGE_PATH_REGEXES = ( diff --git a/catalogue/model.py b/catalogue/model.py index ff30144..5193ff2 100644 --- a/catalogue/model.py +++ b/catalogue/model.py @@ -1,17 +1,17 @@ -import json import hashlib +import json import logging -import magic import os +from contextlib import suppress + +import magic import shutil +from datetime import datetime, timezone, timedelta from itertools import chain from pathlib import PurePath, Path -from contextlib import suppress -from catalogue.metadata import get_image_creation_date, get_path_creation_date -from datetime import datetime, timezone, timedelta from catalogue import __version__ - +from catalogue.metadata import get_image_creation_date, get_path_creation_date CATALOGUE_EXPIRY_DELTA = timedelta(days=1) @@ -138,7 +138,7 @@ def asdict(self): class Catalogue: root_path = None - last_update = None + creation_date = None _files = None _files_by_path = None _files_by_size = None @@ -149,10 +149,11 @@ class Catalogue: def files(self): return tuple(self._files.copy()) - def __init__(self, root_path: Path, files=None, last_update=None): + def __init__( + self, root_path: Path, files=None, creation_date=None): self.root_path = root_path self._files = [] - self.last_update = last_update + self.creation_date = creation_date self._files_by_path = {} self._files_by_size = {} self._files_by_short_hash = {} @@ -163,23 +164,27 @@ def __init__(self, root_path: Path, files=None, last_update=None): @classmethod def load(cls, path): db_data = cls._load_data_from_database(path) - if db_data: - last_update = datetime.fromisoformat(db_data["last_update"]) - if ( - db_data["version"] == __version__ - or datetime.now(timezone.utc) - last_update < CATALOGUE_EXPIRY_DELTA + if db_data and db_data.get("version") == __version__: + creation_date = datetime.fromisoformat(db_data["creation_date"]) + + if datetime.now( + timezone.utc + ) - creation_date < CATALOGUE_EXPIRY_DELTA and len(db_data['files']) == count_number_of_files( + path ): - logging.debug( - "Database file seems suitable, using it to speed up things!" - ) + logging.debug("Database found, using it to speed up things!") files = [ File(**{**file_data, "path": path.joinpath(file_data["path"])}) for file_data in db_data["files"] ] - return cls(path, files=files, last_update=last_update) - logging.debug("Database seems outdated, reverting to scan.") + return cls( + path, + files=files, + creation_date=creation_date, + ) + logging.debug(f"Database found but seems outdated, ignoring it") - logging.debug(f"Database not found, scanning {path}...") + logging.debug(f"Scanning {path}...") return cls._generate_catalogue_from_scan(path) @classmethod @@ -209,7 +214,11 @@ def _generate_catalogue_from_scan(cls, path): logging.warning("Cannot read %s: %s", full_path, e) continue files.append(File(path=file_path, size=file_size)) - return cls(root_path=path, files=files, last_update=datetime.now(timezone.utc)) + return cls( + root_path=path, + files=files, + creation_date=datetime.now(timezone.utc), + ) def notify(self, file, field, new_value): """ @@ -224,7 +233,9 @@ def notify(self, file, field, new_value): with suppress(ValueError): self._files_by_size.setdefault(file.size, []).remove(file) with suppress(ValueError): - self._files_by_short_hash.setdefault(file._short_hash, []).remove(file) + self._files_by_short_hash.setdefault(file._short_hash, []).remove( + file + ) with suppress(ValueError): self._files_by_hash.setdefault(file._hash, []).remove(file) return @@ -298,12 +309,17 @@ def file_asdict(file): return { "version": __version__, - "last_update": self.last_update.isoformat(), + "creation_date": self.creation_date.isoformat(), "files": [file_asdict(file) for file in self._files], } def save_db(self): db_path = self.root_path.joinpath(DATABASE_LOCATION) + + # Adding db file since we are going to save it below and will mess file counting next time + if self.is_path_available(db_path): + self.add_file(File(path=db_path)) + db_data = self.asdict() with open(db_path, "w") as db_file: # json.dump(db_data, db_file, indent=4) # debug @@ -323,6 +339,14 @@ def split_extension(name): return name, "" +def count_number_of_files(path): + file_count = 0 + for dirpath, dirnames, filenames in os.walk(path): + for _ in filenames: + file_count += 1 + return file_count + + def _chunk_reader(fobj, chunk_size=1024): """Generator that reads a file in chunks of bytes""" while True: diff --git a/catalogue/tests/fixtures/filesystem.py b/catalogue/tests/fixtures/filesystem.py index 39de937..99732e0 100644 --- a/catalogue/tests/fixtures/filesystem.py +++ b/catalogue/tests/fixtures/filesystem.py @@ -20,5 +20,5 @@ def catalogue(): assert TEST_CATALOGUE_PATH.exists() return Catalogue( TEST_CATALOGUE_PATH, - last_update=datetime(2021, 1, 1, 22, 00, 30, tzinfo=timezone.utc), + creation_date=datetime(2021, 1, 1, 22, 00, 30, tzinfo=timezone.utc), ) diff --git a/catalogue/tests/test_utils.py b/catalogue/tests/test_utils.py index e3b0435..5346ae2 100644 --- a/catalogue/tests/test_utils.py +++ b/catalogue/tests/test_utils.py @@ -1,6 +1,7 @@ import pytest from catalogue.model import split_extension +from catalogue import __version__ @pytest.mark.parametrize( @@ -40,8 +41,8 @@ def test_catalogue_asdict(catalogue, text_file): "files": [ {"hash": None, "path": "folder/text.txt", "short_hash": None, "size": None} ], - "last_update": "2021-01-01T22:00:30+00:00", - "version": "1.0", + "creation_date": "2021-01-01T22:00:30+00:00", + "version": __version__, }