From 629c7eaccfcc2f9ac25ff29643059ddb85865bb7 Mon Sep 17 00:00:00 2001
From: Iago Veloso <iago.veloso@thisisglobal.com>
Date: Thu, 27 May 2021 23:01:08 +0100
Subject: [PATCH] Improving use of db_catalogue

---
 README.md                              | 17 +++----
 catalogue/__init__.py                  |  2 +-
 catalogue/main.py                      | 39 +++++++-------
 catalogue/metadata.py                  | 45 +++++++++--------
 catalogue/model.py                     | 70 +++++++++++++++++---------
 catalogue/tests/fixtures/filesystem.py |  2 +-
 catalogue/tests/test_utils.py          |  5 +-
 7 files changed, 104 insertions(+), 76 deletions(-)

diff --git a/README.md b/README.md
index 11d132e..21f6b48 100644
--- a/README.md
+++ b/README.md
@@ -12,10 +12,9 @@ Move or copy your files according to your preferred format, using any combinatio
 
 ```bash
 $ catalogue --help
-usage: catalogue [-h] [--version] [--verbose]
-                 [--operation {move,copy,dry-run}] [--src SRC_PATH]
-                 [--dst DST_PATH] [--unknown-folder UNKNOWN_FOLDER]
-                 [--format PATH_FORMAT]
+usage: catalogue [-h] [--version] [--verbose] [--operation {move,copy,dry-run}]
+               [--src SRC_PATH] [--dst DST_PATH] [--format PATH_FORMAT]
+               [--unknown-folder UNKNOWN_FOLDER]
 
 Organize your photos folder,.
 Example usage:
@@ -29,11 +28,11 @@ optional arguments:
                         Specify how to move files (copy, move or dry-run)
   --src SRC_PATH        Path to the source directory.
   --dst DST_PATH        Path to the destination directory.
+  --format PATH_FORMAT  Customize how to structure the files in your catalogue. By default: '%Y/%m/%d/{filename}'
+                        All python strftime format codes are supported as well as {filename}, {basename}, {filename_extension}, {media_type}
   --unknown-folder UNKNOWN_FOLDER
                         If provided will be used for media without creation date
-                        It accepts same options as the format flag, strftime format will refer to current time
-  --format PATH_FORMAT  Customize how to structure the files in your catalogue. e.g: '%Y/%m/%d/{filename}
-                        All python strftime format codes are supported as well as {filename}, {basename}, {filename_extension}, {media_type}
+                        It accepts the same options as the format argument; strftime format codes will refer to the current time instead of the creation date
 ```
 
 ## Requirements
@@ -49,14 +48,14 @@ The easier way to run this project is using docker:
 
 Run example; notice `source` and `my_catalogue` need to be replace with your destinations:
 
-    docker run --rm -v $(pwd)/source:/input:ro -v $(pwd)/my_catalogue:/output iago1460/catalogue:1.2.5 --src /input --dst /output --operation copy
+    docker run --rm -v $(pwd)/source:/input:ro -v $(pwd)/my_catalogue:/output iago1460/catalogue:1.2.6 --src /input --dst /output --operation copy
 
 
 ### In a virtual environment
 
     virtualenv venv
     source venv/bin/activate
-    pip3 install https://github.com/iago1460/photo-cataloguer/archive/1.2.5.zip
+    pip3 install https://github.com/iago1460/photo-cataloguer/archive/1.2.6.zip
     catalogue --help
 
 
diff --git a/catalogue/__init__.py b/catalogue/__init__.py
index 5f8e8a7..b56df3a 100644
--- a/catalogue/__init__.py
+++ b/catalogue/__init__.py
@@ -3,4 +3,4 @@
 if sys.version_info < (3, 9):
     raise RuntimeError("Python 3.9 or later is required")
 
-__version__ = "1.0"
+__version__ = "1.2.6"
diff --git a/catalogue/main.py b/catalogue/main.py
index 8409281..7ad7639 100644
--- a/catalogue/main.py
+++ b/catalogue/main.py
@@ -74,24 +74,24 @@ def main():
         required=False,
         default=None,
     )
-    parser.add_argument(
-        "--unknown-folder",
-        help="If provided will be used for media without creation date\n"
-        "It accepts same options as the format flag, strftime format will refer to current time",
-        dest="unknown_folder",
-        type=str,
-        required=False,
-        default=None,
-    )
     parser.add_argument(
         "--format",
-        help="Customize how to structure the files in your catalogue. e.g: '%%Y/%%m/%%d/{filename}\n"
+        help="Customize how to structure the files in your catalogue. By default : '%%Y/%%m/%%d/{filename}\n"
         "All python strftime format codes are supported as well as {filename}, {basename}, {filename_extension}, {media_type}",
         dest="path_format",
         type=str,
         required=False,
         default="%Y/%m/%d/{filename}",
     )
+    parser.add_argument(
+        "--unknown-folder",
+        help="If provided will be used for media without creation date\n"
+        "It accepts same options as the format argument, strftime format codes will refer to current time instead of creation date",
+        dest="unknown_folder",
+        type=str,
+        required=False,
+        default=None,
+    )
 
     args = parser.parse_args()
     start_dt = datetime.now()
@@ -120,11 +120,11 @@ def main():
         duplicated_files_list = src_catalogue.detect_duplicates_with(dst_catalogue)
 
     if duplicated_files_list:
-        logging.info(f"Ignoring some duplicates files which are already present")
+        logging.info(f"Ignoring some duplicate files which are already present")
         if args.verbose:
             for files_list in duplicated_files_list:
                 logging.info(
-                    "  * {files}".format(
+                    "  - {files}".format(
                         files=", ".join(sorted(map(escape, files_list)))
                     )
                 )
@@ -140,7 +140,7 @@ def main():
         )
         for files_list in duplicated_list_of_files_to_import:
             logging.info(
-                "  * {files}".format(files=", ".join(sorted(map(escape, files_list))))
+                "  - {files}".format(files=", ".join(sorted(map(escape, files_list))))
             )
         # Remove each first file from the list so it gets imported
         duplicated_list_of_files_to_import = set(
@@ -168,10 +168,7 @@ def main():
                 logging.debug(f"Skipping '{media_type}' file {file.path}")
                 continue
 
-            if (
-                file in duplicated_files
-                or file in duplicated_list_of_files_to_import
-            ):
+            if file in duplicated_files or file in duplicated_list_of_files_to_import:
                 logging.debug(f"Skipping duplicated file {file.path}")
                 continue
 
@@ -208,7 +205,9 @@ def main():
                 )
                 imported_files.append(processed_file)
 
-        if args.operation != Operation.DRY_RUN:  # shouldn't save if dry run (data is messed up too)
+        if (
+            args.operation != Operation.DRY_RUN
+        ):  # shouldn't save if dry run (data is messed up too)
             logging.info("Saving catalogue...")
             dst_catalogue.save_db()
 
@@ -235,10 +234,12 @@ def generate_filename(file, path_format, dt, parent_folder, media_type):
 def process_file(file, operation, dst_catalogue, dst_file_path):
     path_available = dst_catalogue.is_path_available(dst_file_path)
     if not path_available:
+        logging.debug(f"Path {dst_file_path} not available, renaming file")
         dst_file_path = dst_catalogue.find_new_path(dst_file_path)
 
     if operation == Operation.DRY_RUN:
-        logging.info(f"dry-run: {file.path} -> {dst_file_path}")
+        collision_indicator = ' *' if not path_available else ''
+        logging.info(f"dry-run: {file.path} -> {dst_file_path}{collision_indicator}")
         file.path = dst_file_path
         dst_catalogue.add_file(file)  # needed so path_available is more accurate
         return None
diff --git a/catalogue/metadata.py b/catalogue/metadata.py
index 54fc353..65b8ec8 100644
--- a/catalogue/metadata.py
+++ b/catalogue/metadata.py
@@ -29,32 +29,35 @@ def _normalize_datetime_format(exif_dt_field):
     except ValueError:
         return exif_dt
     except TypeError:
-        logging.info(f'Cannot parse "{exif_dt}", {exif_dt_field}')
+        logging.debug(f'Cannot parse {exif_dt=}')
         return None
 
 
 def _extract_created_date_from_exif(exif):
-    created_data = exif.get("DateTimeOriginal") or exif.get("DateTime")
-
-    created_data = _normalize_datetime_format(created_data)
-
-    if created_data:
-        try:
-            return dateutil.parser.parse(created_data)
-        except ValueError as e:
-            if e.args[0] == "Unknown string format: %s":
-                unknown_date = e.args[1]
-                logging.debug(f"Attempting to parse unknown date {unknown_date}")
-                match = re.search(
-                    r"(?P<year>\d{4})/(?P<month>\d{2})/(?P<day>\d{2})", unknown_date
+    """Return the EXIF creation datetime or None; raise ValueError if unparseable."""
+    exif_dt_field = exif.get("DateTimeOriginal") or exif.get("DateTime")
+    if not exif_dt_field:
+        return None
+    created_data = _normalize_datetime_format(exif_dt_field)
+    if not created_data:
+        return None
+
+    try:
+        return dateutil.parser.parse(created_data)
+    except ValueError as e:
+        # On an unknown-format error, try a bare YYYY/MM/DD match before re-raising.
+        if e.args[0] == "Unknown string format: %s":
+            unknown_date = e.args[1]
+            logging.debug(f"Attempting to parse unknown date {unknown_date}")
+            match = re.search(
+                r"(?P<year>\d{4})/(?P<month>\d{2})/(?P<day>\d{2})", unknown_date
+            )
+            if match:
+                year, month, day = match.groups()
+                return datetime.datetime(
+                    year=int(year), month=int(month), day=int(day)
                 )
-                if match:
-                    year, month, day = match.groups()
-                    return datetime.datetime(
-                        year=int(year), month=int(month), day=int(day)
-                    )
-            raise e
-    return None
+        raise
 
 
 IMAGE_PATH_REGEXES = (
diff --git a/catalogue/model.py b/catalogue/model.py
index ff30144..5193ff2 100644
--- a/catalogue/model.py
+++ b/catalogue/model.py
@@ -1,17 +1,17 @@
-import json
 import hashlib
+import json
 import logging
-import magic
 import os
+from contextlib import suppress
+
+import magic
 import shutil
+from datetime import datetime, timezone, timedelta
 from itertools import chain
 from pathlib import PurePath, Path
-from contextlib import suppress
-from catalogue.metadata import get_image_creation_date, get_path_creation_date
 
-from datetime import datetime, timezone, timedelta
 from catalogue import __version__
-
+from catalogue.metadata import get_image_creation_date, get_path_creation_date
 
 CATALOGUE_EXPIRY_DELTA = timedelta(days=1)
 
@@ -138,7 +138,7 @@ def asdict(self):
 
 class Catalogue:
     root_path = None
-    last_update = None
+    creation_date = None
     _files = None
     _files_by_path = None
     _files_by_size = None
@@ -149,10 +149,11 @@ class Catalogue:
     def files(self):
         return tuple(self._files.copy())
 
-    def __init__(self, root_path: Path, files=None, last_update=None):
+    def __init__(
+        self, root_path: Path, files=None, creation_date=None):
         self.root_path = root_path
         self._files = []
-        self.last_update = last_update
+        self.creation_date = creation_date
         self._files_by_path = {}
         self._files_by_size = {}
         self._files_by_short_hash = {}
@@ -163,23 +164,27 @@ def __init__(self, root_path: Path, files=None, last_update=None):
     @classmethod
     def load(cls, path):
+        """Build a catalogue for ``path``, reusing its database when possible.
+
+        Falls back to scanning ``path`` when the database cannot be used.
+        """
         db_data = cls._load_data_from_database(path)
-        if db_data:
-            last_update = datetime.fromisoformat(db_data["last_update"])
-            if (
-                db_data["version"] == __version__
-                or datetime.now(timezone.utc) - last_update < CATALOGUE_EXPIRY_DELTA
+        if db_data and db_data.get("version") == __version__:
+            creation_date = datetime.fromisoformat(db_data["creation_date"])
+            # Reuse the database only while it is fresh and lists as many
+            # files as are currently on disk.
+            db_age = datetime.now(timezone.utc) - creation_date
+            if db_age < CATALOGUE_EXPIRY_DELTA and (
+                len(db_data["files"]) == count_number_of_files(path)
             ):
-                logging.debug(
-                    "Database file seems suitable, using it to speed up things!"
-                )
+                logging.debug("Database found, using it to speed up things!")
                 files = [
                     File(**{**file_data, "path": path.joinpath(file_data["path"])})
                     for file_data in db_data["files"]
                 ]
-                return cls(path, files=files, last_update=last_update)
-            logging.debug("Database seems outdated, reverting to scan.")
+                return cls(path, files=files, creation_date=creation_date)
+            logging.debug("Database found but seems outdated, ignoring it")
 
-        logging.debug(f"Database not found, scanning {path}...")
+        logging.debug(f"Scanning {path}...")
         return cls._generate_catalogue_from_scan(path)
 
     @classmethod
@@ -209,7 +214,11 @@ def _generate_catalogue_from_scan(cls, path):
                     logging.warning("Cannot read %s: %s", full_path, e)
                     continue
                 files.append(File(path=file_path, size=file_size))
-        return cls(root_path=path, files=files, last_update=datetime.now(timezone.utc))
+        return cls(
+            root_path=path,
+            files=files,
+            creation_date=datetime.now(timezone.utc),
+        )
 
     def notify(self, file, field, new_value):
         """
@@ -224,7 +233,9 @@ def notify(self, file, field, new_value):
                 with suppress(ValueError):
                     self._files_by_size.setdefault(file.size, []).remove(file)
                 with suppress(ValueError):
-                    self._files_by_short_hash.setdefault(file._short_hash, []).remove(file)
+                    self._files_by_short_hash.setdefault(file._short_hash, []).remove(
+                        file
+                    )
                 with suppress(ValueError):
                     self._files_by_hash.setdefault(file._hash, []).remove(file)
                 return
@@ -298,12 +309,17 @@ def file_asdict(file):
 
         return {
             "version": __version__,
-            "last_update": self.last_update.isoformat(),
+            "creation_date": self.creation_date.isoformat(),
             "files": [file_asdict(file) for file in self._files],
         }
 
     def save_db(self):
         db_path = self.root_path.joinpath(DATABASE_LOCATION)
+
+        # Adding db file since we are going to save it below and will mess file counting next time
+        if self.is_path_available(db_path):
+            self.add_file(File(path=db_path))
+
         db_data = self.asdict()
         with open(db_path, "w") as db_file:
             # json.dump(db_data, db_file, indent=4) # debug
@@ -323,6 +339,14 @@ def split_extension(name):
     return name, ""
 
 
+def count_number_of_files(path):
+    """Return how many files exist under ``path``, walking it recursively."""
+    # os.walk yields one (dirpath, dirnames, filenames) tuple per directory;
+    # only the filenames lists are summed, so directories are not counted.
+    walk = os.walk(path)
+    return sum(len(filenames) for _dirpath, _dirnames, filenames in walk)
+
+
 def _chunk_reader(fobj, chunk_size=1024):
     """Generator that reads a file in chunks of bytes"""
     while True:
diff --git a/catalogue/tests/fixtures/filesystem.py b/catalogue/tests/fixtures/filesystem.py
index 39de937..99732e0 100644
--- a/catalogue/tests/fixtures/filesystem.py
+++ b/catalogue/tests/fixtures/filesystem.py
@@ -20,5 +20,5 @@ def catalogue():
     assert TEST_CATALOGUE_PATH.exists()
     return Catalogue(
         TEST_CATALOGUE_PATH,
-        last_update=datetime(2021, 1, 1, 22, 00, 30, tzinfo=timezone.utc),
+        creation_date=datetime(2021, 1, 1, 22, 00, 30, tzinfo=timezone.utc),
     )
diff --git a/catalogue/tests/test_utils.py b/catalogue/tests/test_utils.py
index e3b0435..5346ae2 100644
--- a/catalogue/tests/test_utils.py
+++ b/catalogue/tests/test_utils.py
@@ -1,6 +1,7 @@
 import pytest
 
 from catalogue.model import split_extension
+from catalogue import __version__
 
 
 @pytest.mark.parametrize(
@@ -40,8 +41,8 @@ def test_catalogue_asdict(catalogue, text_file):
         "files": [
             {"hash": None, "path": "folder/text.txt", "short_hash": None, "size": None}
         ],
-        "last_update": "2021-01-01T22:00:30+00:00",
-        "version": "1.0",
+        "creation_date": "2021-01-01T22:00:30+00:00",
+        "version": __version__,
     }