Improving use of db_catalogue
Iago Veloso committed May 27, 2021
1 parent f0284a9 commit 629c7ea
Showing 7 changed files with 104 additions and 76 deletions.
README.md: 17 changes (8 additions, 9 deletions)
@@ -12,10 +12,9 @@ Move or copy your files according to your preferred format, using any combination

```bash
$ catalogue --help
usage: catalogue [-h] [--version] [--verbose]
[--operation {move,copy,dry-run}] [--src SRC_PATH]
[--dst DST_PATH] [--unknown-folder UNKNOWN_FOLDER]
[--format PATH_FORMAT]
usage: catalogue [-h] [--version] [--verbose] [--operation {move,copy,dry-run}]
[--src SRC_PATH] [--dst DST_PATH] [--format PATH_FORMAT]
[--unknown-folder UNKNOWN_FOLDER]

Organize your photos folder.
Example usage:
@@ -29,11 +28,11 @@ optional arguments:
Specify how to move files (copy, move or dry-run)
--src SRC_PATH Path to the source directory.
--dst DST_PATH Path to the destination directory.
--format PATH_FORMAT Customize how to structure the files in your catalogue. By default : '%Y/%m/%d/{filename}
All python strftime format codes are supported as well as {filename}, {basename}, {filename_extension}, {media_type}
--unknown-folder UNKNOWN_FOLDER
If provided will be used for media without creation date
It accepts same options as the format flag, strftime format will refer to current time
--format PATH_FORMAT Customize how to structure the files in your catalogue. e.g: '%Y/%m/%d/{filename}
All python strftime format codes are supported as well as {filename}, {basename}, {filename_extension}, {media_type}
It accepts same options as the format argument, strftime format codes will refer to current time instead of creation date
```
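
Aside: the `--format` template mixes strftime codes with the placeholders listed above. A minimal sketch of how such a template could resolve; the helper name and resolution order here are illustrative, not the project's actual code:

```python
from datetime import datetime
from pathlib import PurePath

def render_path(template: str, dt: datetime, filename: str) -> PurePath:
    # Expand strftime codes first, then the catalogue placeholders; the tool
    # also supports {basename}, {filename_extension} and {media_type}, but
    # only {filename} is sketched here.
    return PurePath(dt.strftime(template).format(filename=filename))

# render_path("%Y/%m/%d/{filename}", datetime(2021, 5, 27), "IMG_001.jpg")
# -> PurePath('2021/05/27/IMG_001.jpg')
```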
## Requirements
@@ -49,14 +48,14 @@ The easier way to run this project is using docker:
Run example; notice `source` and `my_catalogue` need to be replaced with your destinations:
docker run --rm -v $(pwd)/source:/input:ro -v $(pwd)/my_catalogue:/output iago1460/catalogue:1.2.5 --src /input --dst /output --operation copy
docker run --rm -v $(pwd)/source:/input:ro -v $(pwd)/my_catalogue:/output iago1460/catalogue:1.2.6 --src /input --dst /output --operation copy
### In a virtual environment
virtualenv venv
source venv/bin/activate
pip3 install https://github.com/iago1460/photo-cataloguer/archive/1.2.5.zip
pip3 install https://github.com/iago1460/photo-cataloguer/archive/1.2.6.zip
catalogue --help
catalogue/__init__.py: 2 changes (1 addition, 1 deletion)
@@ -3,4 +3,4 @@
if sys.version_info < (3, 9):
raise RuntimeError("Python 3.9 or later is required")

__version__ = "1.0"
__version__ = "1.2.6"
catalogue/main.py: 39 changes (20 additions, 19 deletions)
@@ -74,24 +74,24 @@ def main():
required=False,
default=None,
)
parser.add_argument(
"--unknown-folder",
help="If provided will be used for media without creation date\n"
"It accepts same options as the format flag, strftime format will refer to current time",
dest="unknown_folder",
type=str,
required=False,
default=None,
)
parser.add_argument(
"--format",
help="Customize how to structure the files in your catalogue. e.g: '%%Y/%%m/%%d/{filename}\n"
help="Customize how to structure the files in your catalogue. By default : '%%Y/%%m/%%d/{filename}\n"
"All python strftime format codes are supported as well as {filename}, {basename}, {filename_extension}, {media_type}",
dest="path_format",
type=str,
required=False,
default="%Y/%m/%d/{filename}",
)
parser.add_argument(
"--unknown-folder",
help="If provided will be used for media without creation date\n"
"It accepts same options as the format argument, strftime format codes will refer to current time instead of creation date",
dest="unknown_folder",
type=str,
required=False,
default=None,
)

args = parser.parse_args()
start_dt = datetime.now()
@@ -120,11 +120,11 @@ def main():
duplicated_files_list = src_catalogue.detect_duplicates_with(dst_catalogue)

if duplicated_files_list:
logging.info(f"Ignoring some duplicates files which are already present")
logging.info(f"Ignoring some duplicate files which are already present")
if args.verbose:
for files_list in duplicated_files_list:
logging.info(
" * {files}".format(
" - {files}".format(
files=", ".join(sorted(map(escape, files_list)))
)
)
@@ -140,7 +140,7 @@
)
for files_list in duplicated_list_of_files_to_import:
logging.info(
" * {files}".format(files=", ".join(sorted(map(escape, files_list))))
" - {files}".format(files=", ".join(sorted(map(escape, files_list))))
)
# Remove each first file from the list so it gets imported
duplicated_list_of_files_to_import = set(
@@ -168,10 +168,7 @@ def main():
logging.debug(f"Skipping '{media_type}' file {file.path}")
continue

if (
file in duplicated_files
or file in duplicated_list_of_files_to_import
):
if file in duplicated_files or file in duplicated_list_of_files_to_import:
logging.debug(f"Skipping duplicated file {file.path}")
continue

@@ -208,7 +205,9 @@ def main():
)
imported_files.append(processed_file)

if args.operation != Operation.DRY_RUN: # shouldn't save if dry run (data is messed up too)
if (
args.operation != Operation.DRY_RUN
): # shouldn't save if dry run (data is messed up too)
logging.info("Saving catalogue...")
dst_catalogue.save_db()

@@ -235,10 +234,12 @@ def generate_filename(file, path_format, dt, parent_folder, media_type):
def process_file(file, operation, dst_catalogue, dst_file_path):
path_available = dst_catalogue.is_path_available(dst_file_path)
if not path_available:
logging.debug(f"Path {dst_file_path} not available, renaming file")
dst_file_path = dst_catalogue.find_new_path(dst_file_path)

if operation == Operation.DRY_RUN:
logging.info(f"dry-run: {file.path} -> {dst_file_path}")
collision_indicator = ' *' if not path_available else ''
logging.info(f"dry-run: {file.path} -> {dst_file_path}{collision_indicator}")
file.path = dst_file_path
dst_catalogue.add_file(file) # needed so path_available is more accurate
return None
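
Aside: the duplicate handling earlier in this file keeps the first file of each duplicate group and skips the rest, per the "Remove each first file from the list so it gets imported" comment. A standalone sketch of that pattern, with hypothetical names:

```python
from itertools import chain

def files_to_skip(duplicate_groups):
    # Keep the first file of every group of identical files; everything
    # else is marked as a duplicate to be skipped during import.
    return set(chain.from_iterable(group[1:] for group in duplicate_groups))

# files_to_skip([["a.jpg", "a_copy.jpg"], ["b.png", "b (1).png"]])
# -> {"a_copy.jpg", "b (1).png"}
```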
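Aside: `process_file` now logs the rename and flags collisions in dry-run output with ` *`. One plausible renaming strategy behind a helper like `find_new_path` (the real implementation may differ):

```python
from pathlib import PurePath

def find_free_path(path: PurePath, taken: set) -> PurePath:
    # Append _1, _2, ... to the stem until the name is unused.
    candidate, n = path, 1
    while candidate in taken:
        candidate = path.with_name(f"{path.stem}_{n}{path.suffix}")
        n += 1
    return candidate

# find_free_path(PurePath("2021/05/27/photo.jpg"), {PurePath("2021/05/27/photo.jpg")})
# -> PurePath('2021/05/27/photo_1.jpg')
```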
catalogue/metadata.py: 45 changes (24 additions, 21 deletions)
@@ -29,32 +29,35 @@ def _normalize_datetime_format(exif_dt_field):
except ValueError:
return exif_dt
except TypeError:
logging.info(f'Cannot parse "{exif_dt}", {exif_dt_field}')
logging.debug(f'Cannot parse {exif_dt=}')
return None


def _extract_created_date_from_exif(exif):
created_data = exif.get("DateTimeOriginal") or exif.get("DateTime")

created_data = _normalize_datetime_format(created_data)

if created_data:
try:
return dateutil.parser.parse(created_data)
except ValueError as e:
if e.args[0] == "Unknown string format: %s":
unknown_date = e.args[1]
logging.debug(f"Attempting to parse unknown date {unknown_date}")
match = re.search(
r"(?P<year>\d{4})/(?P<month>\d{2})/(?P<day>\d{2})", unknown_date
exif_dt_field = exif.get("DateTimeOriginal") or exif.get("DateTime")

if not exif_dt_field:
return None

created_data = _normalize_datetime_format(exif_dt_field)
if not created_data:
return None

try:
return dateutil.parser.parse(created_data)
except ValueError as e:
if e.args[0] == "Unknown string format: %s":
unknown_date = e.args[1]
logging.debug(f"Attempting to parse unknown date {unknown_date}")
match = re.search(
r"(?P<year>\d{4})/(?P<month>\d{2})/(?P<day>\d{2})", unknown_date
)
if match:
year, month, day = match.groups()
return datetime.datetime(
year=int(year), month=int(month), day=int(day)
)
if match:
year, month, day = match.groups()
return datetime.datetime(
year=int(year), month=int(month), day=int(day)
)
raise e
return None
raise e


IMAGE_PATH_REGEXES = (
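Aside: EXIF `DateTimeOriginal`/`DateTime` fields use a colon-separated date (`2021:05:27 14:30:00`), which generic date parsers reject; that is what the normalisation step above works around. A hedged sketch of the idea, not the module's exact code:

```python
import datetime
from typing import Optional

def parse_exif_datetime(exif_dt: str) -> Optional[datetime.datetime]:
    # EXIF stores timestamps as "YYYY:MM:DD HH:MM:SS"; try that layout directly.
    try:
        return datetime.datetime.strptime(exif_dt, "%Y:%m:%d %H:%M:%S")
    except (ValueError, TypeError):
        return None

# parse_exif_datetime("2021:05:27 14:30:00")
# -> datetime.datetime(2021, 5, 27, 14, 30)
```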
catalogue/model.py: 70 changes (47 additions, 23 deletions)
@@ -1,17 +1,17 @@
import json
import hashlib
import json
import logging
import magic
import os
from contextlib import suppress

import magic
import shutil
from datetime import datetime, timezone, timedelta
from itertools import chain
from pathlib import PurePath, Path
from contextlib import suppress
from catalogue.metadata import get_image_creation_date, get_path_creation_date

from datetime import datetime, timezone, timedelta
from catalogue import __version__

from catalogue.metadata import get_image_creation_date, get_path_creation_date

CATALOGUE_EXPIRY_DELTA = timedelta(days=1)

@@ -138,7 +138,7 @@ def asdict(self):

class Catalogue:
root_path = None
last_update = None
creation_date = None
_files = None
_files_by_path = None
_files_by_size = None
@@ -149,10 +149,11 @@ class Catalogue:
def files(self):
return tuple(self._files.copy())

def __init__(self, root_path: Path, files=None, last_update=None):
def __init__(
self, root_path: Path, files=None, creation_date=None):
self.root_path = root_path
self._files = []
self.last_update = last_update
self.creation_date = creation_date
self._files_by_path = {}
self._files_by_size = {}
self._files_by_short_hash = {}
@@ -163,23 +164,27 @@ def __init__(self, root_path: Path, files=None, last_update=None):
@classmethod
def load(cls, path):
db_data = cls._load_data_from_database(path)
if db_data:
last_update = datetime.fromisoformat(db_data["last_update"])
if (
db_data["version"] == __version__
or datetime.now(timezone.utc) - last_update < CATALOGUE_EXPIRY_DELTA
if db_data and db_data.get("version") == __version__:
creation_date = datetime.fromisoformat(db_data["creation_date"])

if datetime.now(
timezone.utc
) - creation_date < CATALOGUE_EXPIRY_DELTA and len(db_data['files']) == count_number_of_files(
path
):
logging.debug(
"Database file seems suitable, using it to speed up things!"
)
logging.debug("Database found, using it to speed up things!")
files = [
File(**{**file_data, "path": path.joinpath(file_data["path"])})
for file_data in db_data["files"]
]
return cls(path, files=files, last_update=last_update)
logging.debug("Database seems outdated, reverting to scan.")
return cls(
path,
files=files,
creation_date=creation_date,
)
logging.debug(f"Database found but seems outdated, ignoring it")

logging.debug(f"Database not found, scanning {path}...")
logging.debug(f"Scanning {path}...")
return cls._generate_catalogue_from_scan(path)

@classmethod
@@ -209,7 +214,11 @@ def _generate_catalogue_from_scan(cls, path):
logging.warning("Cannot read %s: %s", full_path, e)
continue
files.append(File(path=file_path, size=file_size))
return cls(root_path=path, files=files, last_update=datetime.now(timezone.utc))
return cls(
root_path=path,
files=files,
creation_date=datetime.now(timezone.utc),
)

def notify(self, file, field, new_value):
"""
@@ -224,7 +233,9 @@ def notify(self, file, field, new_value):
with suppress(ValueError):
self._files_by_size.setdefault(file.size, []).remove(file)
with suppress(ValueError):
self._files_by_short_hash.setdefault(file._short_hash, []).remove(file)
self._files_by_short_hash.setdefault(file._short_hash, []).remove(
file
)
with suppress(ValueError):
self._files_by_hash.setdefault(file._hash, []).remove(file)
return
@@ -298,12 +309,17 @@ def file_asdict(file):

return {
"version": __version__,
"last_update": self.last_update.isoformat(),
"creation_date": self.creation_date.isoformat(),
"files": [file_asdict(file) for file in self._files],
}

def save_db(self):
db_path = self.root_path.joinpath(DATABASE_LOCATION)

# Adding db file since we are going to save it below and will mess file counting next time
if self.is_path_available(db_path):
self.add_file(File(path=db_path))

db_data = self.asdict()
with open(db_path, "w") as db_file:
# json.dump(db_data, db_file, indent=4) # debug
@@ -323,6 +339,14 @@ def split_extension(name):
return name, ""


def count_number_of_files(path):
file_count = 0
for dirpath, dirnames, filenames in os.walk(path):
for _ in filenames:
file_count += 1
return file_count


def _chunk_reader(fobj, chunk_size=1024):
"""Generator that reads a file in chunks of bytes"""
while True:
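Aside: the `load` changes above amount to a three-part validity test for the cached database: same tool version, younger than `CATALOGUE_EXPIRY_DELTA`, and an on-disk file count that still matches the recorded one. A compact sketch of that predicate, simplified from the diff:

```python
import os
from datetime import datetime, timedelta, timezone

CATALOGUE_EXPIRY_DELTA = timedelta(days=1)

def db_is_usable(db_data, path, current_version):
    # Mirrors the checks in Catalogue.load: version match, freshness,
    # and an unchanged number of files under the catalogue root.
    if not db_data or db_data.get("version") != current_version:
        return False
    creation_date = datetime.fromisoformat(db_data["creation_date"])
    if datetime.now(timezone.utc) - creation_date >= CATALOGUE_EXPIRY_DELTA:
        return False
    file_count = sum(len(filenames) for _, _, filenames in os.walk(path))
    return len(db_data["files"]) == file_count
```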
catalogue/tests/fixtures/filesystem.py: 2 changes (1 addition, 1 deletion)
@@ -20,5 +20,5 @@ def catalogue():
assert TEST_CATALOGUE_PATH.exists()
return Catalogue(
TEST_CATALOGUE_PATH,
last_update=datetime(2021, 1, 1, 22, 00, 30, tzinfo=timezone.utc),
creation_date=datetime(2021, 1, 1, 22, 00, 30, tzinfo=timezone.utc),
)
catalogue/tests/test_utils.py: 5 changes (3 additions, 2 deletions)
@@ -1,6 +1,7 @@
import pytest

from catalogue.model import split_extension
from catalogue import __version__


@pytest.mark.parametrize(
@@ -40,8 +41,8 @@ def test_catalogue_asdict(catalogue, text_file):
"files": [
{"hash": None, "path": "folder/text.txt", "short_hash": None, "size": None}
],
"last_update": "2021-01-01T22:00:30+00:00",
"version": "1.0",
"creation_date": "2021-01-01T22:00:30+00:00",
"version": __version__,
}


