From ce5598bd64b93a03bb89c372c0ba270f80639794 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xinru=20Yan=20=F0=9F=87=A8=F0=9F=87=B3=20=20=F0=9F=91=91?=
Date: Mon, 14 Mar 2022 19:38:37 -0700
Subject: [PATCH 1/9] Added pdf and attachment hashes to bin/anthology

---
 bin/anthology/papers.py | 4 ++++
 bin/anthology/utils.py  | 3 +++
 2 files changed, 7 insertions(+)

diff --git a/bin/anthology/papers.py b/bin/anthology/papers.py
index 989478b4c2..74eaed0e74 100644
--- a/bin/anthology/papers.py
+++ b/bin/anthology/papers.py
@@ -97,6 +97,10 @@ def videos(self):
             ]
         return []
 
+    @cached_property
+    def pdf_hash(self):
+        return self.attrib.get("pdf_hash", None)
+
     def _parse_revision_or_errata(self, tag):
         for item in self.attrib.get(tag, []):
             # Expand URLs with paper ID
diff --git a/bin/anthology/utils.py b/bin/anthology/utils.py
index 284fb63ee2..ae231cc9ea 100644
--- a/bin/anthology/utils.py
+++ b/bin/anthology/utils.py
@@ -20,6 +20,7 @@
 import re
 import requests
 import shutil
+import subprocess
 
 from lxml import etree
 from urllib.parse import urlparse
@@ -415,11 +416,13 @@ def parse_element(xml_element):
         elif tag == "url":
             tag = "xml_url"
             value = element.text
+            attrib['pdf_hash'] = element.get("hash")
         elif tag == "attachment":
             value = {
                 "filename": element.text,
                 "type": element.get("type", "attachment"),
                 "url": element.text,
+                "hash": element.get("hash"),
             }
         elif tag in ("author", "editor"):
             id_ = element.attrib.get("id", None)

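Reviewer context for patch 1: the checksum travels as a `hash` attribute on the `<url>` and `<attachment>` elements of the Anthology XML, which `parse_element` copies into `attrib` and the new `pdf_hash` property exposes. A minimal sketch of reading the parsed values back; the paper ID and hash here are made-up examples, not real data:

    # Assumes an XML entry like <url hash="4d36e96f">2022.acl-long.1</url>
    from anthology import Anthology

    anth = Anthology(importdir="data")
    paper = anth.papers["2022.acl-long.1"]        # hypothetical ID
    print(paper.pdf_hash)                         # "4d36e96f", or None if the XML has no hash
    for attachment in paper.attachments:
        print(attachment["filename"], attachment["hash"])
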
From a6ff39cf5fc9209b50de13c0eb586cfcdb938ca8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xinru=20Yan=20=F0=9F=87=A8=F0=9F=87=B3=20=20=F0=9F=91=91?=
Date: Mon, 14 Mar 2022 19:46:59 -0700
Subject: [PATCH 2/9] Added upload_file_to_queue method to bin/utils

---
 bin/anthology/data.py  |  6 ++++++
 bin/anthology/utils.py | 45 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+)

diff --git a/bin/anthology/data.py b/bin/anthology/data.py
index 4d47f1dd8d..6af0722358 100644
--- a/bin/anthology/data.py
+++ b/bin/anthology/data.py
@@ -22,6 +22,7 @@
 ################################################################################
 
 import os
+from enum import Enum
 
 # this is the canonical URL. In contrast to all other
 # URL templates, it always links to the official anthology.
@@ -98,3 +99,8 @@ def get_journal_title(top_level_id, volume_title):
         return "Transactions of the Association for Computational Linguistics"
     else:
         return volume_title
+
+
+class ResourceType(Enum):
+    PDF = 'pdf'
+    ATTACHMENT = 'attachments'
diff --git a/bin/anthology/utils.py b/bin/anthology/utils.py
index ae231cc9ea..21b957b14b 100644
--- a/bin/anthology/utils.py
+++ b/bin/anthology/utils.py
@@ -492,3 +492,48 @@ def compute_hash(value: bytes) -> str:
 def compute_hash_from_file(path: str) -> str:
     with open(path, "rb") as f:
         return compute_hash(f.read())
+
+
+# For auto-uploading files to the server
+# The root directory for files
+ANTHOLOGY_FILE_ROOT = "anthology-files"
+
+# The ssh shortcut (in ~/.ssh/config) or full hostname
+ANTHOLOGY_HOST = "anth"
+
+
+def upload_file_to_queue(
+    local_path: str,
+    resource_type: data.ResourceType,
+    venue_name: str,
+    filename: str,
+    file_hash: str,
+    commit: bool = False,
+):
+    actual_hash = compute_hash_from_file(local_path)
+    if file_hash != actual_hash:
+        raise Exception(
+            f"Got unexpected hash, file contains incorrect data. (actual hash: {actual_hash}, expected: {file_hash})"
+        )
+
+    mkdir_cmd = [
+        'ssh',
+        ANTHOLOGY_HOST,
+        f'mkdir -p {ANTHOLOGY_FILE_ROOT}/queue/{resource_type.value}/{venue_name}',
+    ]
+    if commit:
+        subprocess.check_call(mkdir_cmd)
+    else:
+        logging.info(f"Would run: {mkdir_cmd}")
+
+    upload_cmd = [
+        "rsync",
+        "-lptgoDve",
+        "ssh",
+        local_path,
+        f"{ANTHOLOGY_HOST}:{ANTHOLOGY_FILE_ROOT}/queue/{resource_type.value}/{venue_name}/{filename}.{file_hash}",
+    ]
+    if commit:
+        subprocess.check_call(upload_cmd)
+    else:
+        logging.info(f"Would run: {upload_cmd}")

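Worth noting about patch 2: upload_file_to_queue refuses to stage a file whose on-disk checksum disagrees with the hash the caller took from the XML, and it lands files under queue/{type}/{venue}/{filename}.{hash} rather than in the live tree. A hedged dry-run sketch (path, venue, and hash are hypothetical):

    from anthology.data import ResourceType
    from anthology.utils import upload_file_to_queue

    # commit defaults to False, so the mkdir/rsync commands are only logged, never run.
    upload_file_to_queue(
        "/tmp/2022.acl-long.1.pdf",          # hypothetical local file
        resource_type=ResourceType.PDF,
        venue_name="acl",
        filename="2022.acl-long.1.pdf",
        file_hash="4d36e96f",                # must equal compute_hash_from_file(local_path)
    )
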
From 6765599828625087231f82573c7fe33e7a3fa684 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xinru=20Yan=20=F0=9F=87=A8=F0=9F=87=B3=20=20=F0=9F=91=91?=
Date: Mon, 14 Mar 2022 20:38:33 -0700
Subject: [PATCH 3/9] Added volume hash to bin/anthology

---
 bin/anthology/volumes.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/bin/anthology/volumes.py b/bin/anthology/volumes.py
index d3377568d2..77a3b6fd7e 100644
--- a/bin/anthology/volumes.py
+++ b/bin/anthology/volumes.py
@@ -104,6 +104,10 @@ def pdf(self):
             return infer_url(url, template=data.PDF_LOCATION_TEMPLATE)
         return None
 
+    @cached_property
+    def pdf_hash(self):
+        return self.attrib.get("pdf_hash", None)
+
     def _set_meta_info(self, meta_data):
         """Derive journal title, volume, and issue no. used in metadata.

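Patch 3 mirrors the paper-level property on volumes so that full-proceedings PDFs can be verified the same way. Continuing the sketch above (volume ID hypothetical):

    volume = anth.volumes["2022.acl-long"]
    print(volume.pdf_hash)                   # hash of the volume PDF, or None
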
log.info("Enqueuing Attachments...") + enqueue_dir(anth, attachments_dir, ResourceType.ATTACHMENT, commit) + + if not commit: + if tracker.highest >= log.ERROR: + log.warning( + "There were errors! Please check them carefully before re-running this script with -c/--commit." + ) + else: + log.warning( + "Re-run this script with -c/--commit to upload these files to the server." + ) + + if tracker.highest >= log.ERROR: + exit(1) + + +if __name__ == "__main__": + main() From f216817484b14f652b0617cf5a740ff56d791085 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Xinru=20Yan=20=F0=9F=87=A8=F0=9F=87=B3=20=20=F0=9F=91=91?= Date: Fri, 25 Mar 2022 20:58:41 -0700 Subject: [PATCH 5/9] minor updates: moved two functions' locations --- bin/anthology/anthology.py | 24 ++++++++++++++++++++++++ bin/anthology/utils.py | 9 +++++++++ bin/enqueue_files.py | 27 +-------------------------- 3 files changed, 34 insertions(+), 26 deletions(-) diff --git a/bin/anthology/anthology.py b/bin/anthology/anthology.py index c5adb6a9c9..61ef1e0329 100644 --- a/bin/anthology/anthology.py +++ b/bin/anthology/anthology.py @@ -28,6 +28,8 @@ from .venues import VenueIndex from .volumes import Volume from .sigs import SIGIndex +from .data import ResourceType +from .utils import get_proceedings_id_from_filename class Anthology: @@ -152,3 +154,25 @@ def import_file(self, filename): continue volume.append(parsed_paper) self.papers[full_id] = parsed_paper + + def get_hash_for_resource(self, resource_type: ResourceType, filename: str) -> str: + proceedings_id = get_proceedings_id_from_filename(resource_type, filename) + if proceedings_id not in self.papers and proceedings_id not in self.volumes: + raise Exception(f"Paper/Volume for PDF {proceedings_id!r} does not exist.") + + resource_hash = None + if resource_type == ResourceType.PDF: + resource_hash = self.papers.get( + proceedings_id, self.volumes.get(proceedings_id) + ).pdf_hash + elif resource_type == ResourceType.ATTACHMENT: + attachments = self.papers[proceedings_id].attachments + filename_to_hash = {a['filename']: a['hash'] for a in attachments} + resource_hash = filename_to_hash.get(filename) + + if resource_hash is None: + raise Exception( + "Hash for resource is None. Please update with value before running this script." 
From f216817484b14f652b0617cf5a740ff56d791085 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xinru=20Yan=20=F0=9F=87=A8=F0=9F=87=B3=20=20=F0=9F=91=91?=
Date: Fri, 25 Mar 2022 20:58:41 -0700
Subject: [PATCH 5/9] minor updates: moved two functions' locations

---
 bin/anthology/anthology.py | 24 ++++++++++++++++++++++++
 bin/anthology/utils.py     |  9 +++++++++
 bin/enqueue_files.py       | 27 +--------------------------
 3 files changed, 34 insertions(+), 26 deletions(-)

diff --git a/bin/anthology/anthology.py b/bin/anthology/anthology.py
index c5adb6a9c9..61ef1e0329 100644
--- a/bin/anthology/anthology.py
+++ b/bin/anthology/anthology.py
@@ -28,6 +28,8 @@
 from .venues import VenueIndex
 from .volumes import Volume
 from .sigs import SIGIndex
+from .data import ResourceType
+from .utils import get_proceedings_id_from_filename
 
 
 class Anthology:
@@ -152,3 +154,25 @@ def import_file(self, filename):
                 continue
             volume.append(parsed_paper)
             self.papers[full_id] = parsed_paper
+
+    def get_hash_for_resource(self, resource_type: ResourceType, filename: str) -> str:
+        proceedings_id = get_proceedings_id_from_filename(resource_type, filename)
+        if proceedings_id not in self.papers and proceedings_id not in self.volumes:
+            raise Exception(f"Paper/Volume for PDF {proceedings_id!r} does not exist.")
+
+        resource_hash = None
+        if resource_type == ResourceType.PDF:
+            resource_hash = self.papers.get(
+                proceedings_id, self.volumes.get(proceedings_id)
+            ).pdf_hash
+        elif resource_type == ResourceType.ATTACHMENT:
+            attachments = self.papers[proceedings_id].attachments
+            filename_to_hash = {a['filename']: a['hash'] for a in attachments}
+            resource_hash = filename_to_hash.get(filename)
+
+        if resource_hash is None:
+            raise Exception(
+                "Hash for resource is None. Please update with value before running this script."
+            )
+
+        return resource_hash
diff --git a/bin/anthology/utils.py b/bin/anthology/utils.py
index 21b957b14b..95bf11217d 100644
--- a/bin/anthology/utils.py
+++ b/bin/anthology/utils.py
@@ -537,3 +537,12 @@ def upload_file_to_queue(
         subprocess.check_call(upload_cmd)
     else:
         logging.info(f"Would run: {upload_cmd}")
+
+
+def get_proceedings_id_from_filename(
+    resource_type: data.ResourceType, filename: str
+) -> str:
+    trailing_dots = {data.ResourceType.PDF: 1, data.ResourceType.ATTACHMENT: 2}[
+        resource_type
+    ]
+    return filename.rsplit('.', trailing_dots)[0]
diff --git a/bin/enqueue_files.py b/bin/enqueue_files.py
index 71ff8f1077..8238f45963 100644
--- a/bin/enqueue_files.py
+++ b/bin/enqueue_files.py
@@ -41,31 +41,6 @@ def get_proceedings_id_from_filename(resource_type: ResourceType, filename: str)
     return filename.rsplit('.', trailing_dots)[0]
 
 
-def get_hash_for_resource(
-    anth: Anthology, resource_type: ResourceType, filename: str
-) -> str:
-    proceedings_id = get_proceedings_id_from_filename(resource_type, filename)
-    if proceedings_id not in anth.papers and proceedings_id not in anth.volumes:
-        raise Exception(f"Paper/Volume for PDF {proceedings_id!r} does not exist.")
-
-    resource_hash = None
-    if resource_type == ResourceType.PDF:
-        resource_hash = anth.papers.get(
-            proceedings_id, anth.volumes.get(proceedings_id)
-        ).pdf_hash
-    elif resource_type == ResourceType.ATTACHMENT:
-        attachments = anth.papers[proceedings_id].attachments
-        filename_to_hash = {a['filename']: a['hash'] for a in attachments}
-        resource_hash = filename_to_hash.get(filename)
-
-    if resource_hash is None:
-        raise Exception(
-            "Hash for resource is None. Please update with value before running this script."
-        )
-
-    return resource_hash
-
-
 # Iterate over files in resource directory, find the hash in the Anthology and upload the file (if commit)
 def enqueue_dir(
     anth: Anthology,
@@ -79,7 +54,7 @@ def enqueue_dir(
 
             # Get resource hash
             try:
-                resource_hash = get_hash_for_resource(anth, resource_type, filename)
+                resource_hash = anth.get_hash_for_resource(resource_type, filename)
             except Exception as e:
                 log.error(f"{e} (filename: {local_path!r})", exc_info=True)
                 continue

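After this move the lookup lives on the Anthology object itself, so callers pass only the resource type and filename. A short usage sketch (filename hypothetical):

    resource_hash = anth.get_hash_for_resource(ResourceType.PDF, "2022.acl-long.1.pdf")
    # Raises if the paper/volume is unknown, or if its XML entry has no hash recorded yet.
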
From eae6eab5b07de688bd32304c5aba6235a8f05774 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xinru=20Yan=20=F0=9F=87=A8=F0=9F=87=B3=20=20=F0=9F=91=91?=
Date: Mon, 28 Mar 2022 09:05:46 -0700
Subject: [PATCH 6/9] first pass for commit_queue.py

---
 bin/commit_queue.py | 334 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 334 insertions(+)
 create mode 100644 bin/commit_queue.py

diff --git a/bin/commit_queue.py b/bin/commit_queue.py
new file mode 100644
index 0000000000..cc45401739

# Options
# 1. Iterate through queue, if any have the checksum that is in the xml, copy to "real" location and delete from queue
# pros: should be the fastest since queue should be kept clean.
# cons: will not detect any issues in "real" location.
# 2. Iterate through all Papers, check if checksummed file is in queue, copy if there is and remove from queue.
# pros: will detect if any resources are missing.
# cons: need to iterate through all papers every time, will not catch case where extra files were copied.
# 3. Iterate through "real" location and checksum all files, if mismatched with paper, look at queue to copy in new file (e.g. a pdf or an attachment).
# pros: will detect if there are files in the "real" location that aren't referenced, and any resources that are referenced but have a different checksum (different version).
# cons: need to iterate through all files in "real" location and do checksum
# 4. Combine both #2 and #3
# pros: will detect missing resources, extra resources and outdated resources
# cons: have to iterate all Papers, etc... in xml and checksum all files in the "real" location


from typing import List, Optional
import os
import click
import logging as log
from functools import partial
import subprocess

from anthology import Anthology
from anthology.data import ANTHOLOGY_DATA_DIR, ResourceType
from anthology.utils import SeverityTracker, compute_hash_from_file

# Enable show default by default
click.option = partial(click.option, show_default=True)

# The root directory for files, currently containing pdf/ and attachments/
ANTHOLOGY_FILE_ROOT = "/home/anthologizer/anthology-files"

# The ssh shortcut (in ~/.ssh/config) or full hostname
ANTHOLOGY_HOST = "anth"

# The remote url of the acl anthology git repo
REMOTE_URL = "https://github.com/acl-org/acl-anthology.git"

# The main branch of the acl anthology git repo
REMOTE_MAIN_BRANCH_NAME = "master"


class ServerLocation:
    REMOTE = 'remote'
    LOCAL = 'local'


def is_clean_checkout_of_remote_branch(
    repo_dir: str, remote_url: str, remote_main_branch_name: str
) -> bool:
    # Check if repo is clean
    status = (
        subprocess.check_output(["git", "status", "-uall", "--short"], cwd=repo_dir)
        .decode('utf-8')
        .strip()
    )
    if status:
        log.debug(
            f"Repo @ {repo_dir!r} is not clean. It has the following changes:\n{status}"
        )
        return False

    # Check tracking url and branch
    current_ref = (
        subprocess.check_output(["git", "symbolic-ref", "-q", "HEAD"], cwd=repo_dir)
        .decode('utf-8')
        .strip()
    )
    remote_tracking_branch_ref = subprocess.check_output(
        ["git", "for-each-ref", "--format=%(upstream:short)", current_ref], cwd=repo_dir
    ).decode('utf-8').strip()

    if "/" not in remote_tracking_branch_ref:
        msg = f"Invalid remote tracking branch ref {remote_tracking_branch_ref}"
        log.error(msg)
        raise Exception(msg)

    tracking_remote_name, remote_tracking_branch = remote_tracking_branch_ref.split(
        '/', 1
    )

    if remote_tracking_branch != remote_main_branch_name:
        log.debug(
            f"Remote tracking branch {remote_tracking_branch!r} is not main remote branch {remote_main_branch_name!r}"
        )
        return False

    tracking_remote_url = (
        subprocess.check_output(
            ["git", "remote", "get-url", tracking_remote_name], cwd=repo_dir
        )
        .decode('utf-8')
        .strip()
    )

    if tracking_remote_url != remote_url:
        log.debug(
            f"Remote tracking url {tracking_remote_url!r} is not the remote url {remote_url!r}"
        )
        return False
    return True


def run_remote_command(cmd):
    return subprocess.check_output(['ssh', ANTHOLOGY_HOST, cmd]).decode('utf-8').strip()


class FileSystemOps:
    def __init__(self, remote: bool, host: Optional[str], commit: bool):
        self.remote = remote
        self.host = host
        self.commit = commit
        if remote and not host:
            raise Exception(f"If remote is true host is required but got host: {host!r}")

        self.root_dir = ANTHOLOGY_FILE_ROOT if remote else ANTHOLOGY_DATA_DIR

    def listdir(self, relative_path: str) -> List[str]:
        abs_dir = f'{self.root_dir}/{relative_path}'
        if self.remote:
            return (
                subprocess.check_output(['ssh', self.host, f'ls {abs_dir}'])
                .decode('utf-8')
                .strip()
                .split('\n')
            )
        else:
            return os.listdir(abs_dir)

    def movefile(self, relative_src_path: str, relative_dest_path: str):
        abs_src = f'{self.root_dir}/{relative_src_path}'
        abs_dest = f'{self.root_dir}/{relative_dest_path}'

        if self.remote:
            cmd = ['ssh', self.host, f'mv {abs_src} {abs_dest}']
            if self.commit:
                subprocess.check_call(cmd)
            else:
                log.info(f"Would run: {cmd}")
        else:
            if self.commit:
                os.rename(abs_src, abs_dest)
            else:
                log.info(f"Would move file {abs_src!r} to {abs_dest!r}")

    def hashfile(self, relative_path: str) -> str:
        abs_dir = f'{self.root_dir}/{relative_path}'
        if self.remote:
            return (
                subprocess.check_output(['ssh', self.host, f'crc32 {abs_dir}'])
                .decode('utf-8')
                .strip()
            )
        else:
            return compute_hash_from_file(abs_dir)

    def exists(self, relative_path: str) -> bool:
        abs_dir = f'{self.root_dir}/{relative_path}'
        if self.remote:
            try:
                subprocess.check_output(['ssh', self.host, f'stat {abs_dir}'])
                return True
            except subprocess.CalledProcessError:
                return False
        else:
            return os.path.exists(abs_dir)

    def remove(self, relative_path: str) -> bool:
        abs_dir = f'{self.root_dir}/{relative_path}'
        if self.remote:
            cmd = ['ssh', self.host, f'rm {abs_dir}']
            if self.commit:
                subprocess.check_call(cmd)
            else:
                log.info(f"Would run: {cmd}")
        else:
            if self.commit:
                os.remove(abs_dir)
            else:
                log.info(f"Would remove file {abs_dir!r}")


def process_queue(anth: Anthology, resource_type: ResourceType, fs: FileSystemOps):
    queue_base_path = f'queue/{resource_type.value}'
    for venue_name in fs.listdir(queue_base_path):
        for filename in fs.listdir(os.path.join(queue_base_path, venue_name)):
            base_filename, file_hash = filename.rsplit('.', 1)

            # Get main branch resource hash
            try:
                current_version_hash = anth.get_hash_for_resource(
                    anth, resource_type, base_filename
                )
            except Exception as e:
                log.error(f"{e} (filename: {filename!r})", exc_info=True)
                continue

            if file_hash == current_version_hash:
                log.info(
                    f"Found queued file matching hash: {os.path.join(queue_base_path, venue_name, filename)}"
                )
                fs.movefile(
                    os.path.join(queue_base_path, venue_name, filename),
                    os.path.join(resource_type.value, venue_name, base_filename),
                )


def get_all_pdf_filepath_to_hash(anth: Anthology):
    filepath_to_hash = {}
    for _, paper in anth.papers.items():
        if paper.pdf is not None:
            filepath = (
                f"{ResourceType.PDF.value}/{paper.collection_id}/{paper.full_id}.pdf"
            )
            filepath_to_hash[filepath] = paper.pdf_hash

    return filepath_to_hash


def get_all_attachment_filepath_to_hash(anth: Anthology):
    filepath_to_hash = {}
    for _, paper in anth.papers.items():
        for attachment in paper.attachments:
            filepath = f"{ResourceType.ATTACHMENT.value}/{paper.collection_id}/{attachment['filename']}"
            filepath_to_hash[filepath] = attachment['hash']

    return filepath_to_hash


def complete_check(anth: Anthology, resource_type: ResourceType, fs: FileSystemOps):
    log.error("Complete check isn't implemented yet")
    # get all hashes for files in server "live" directory
    live_filepath_to_hash = {}
    base_path = f'{resource_type.value}'
    for venue_name in fs.listdir(base_path):
        for filename in fs.listdir(os.path.join(base_path, venue_name)):
            filepath = os.path.join(base_path, venue_name, filename)
            live_filepath_to_hash[filepath] = fs.hashfile(filepath)

    expected_filepath_to_hash = {
        ResourceType.PDF: get_all_pdf_filepath_to_hash,
        ResourceType.ATTACHMENT: get_all_attachment_filepath_to_hash,
    }[resource_type](anth)

    missing_files = set(expected_filepath_to_hash.keys() - live_filepath_to_hash.keys())
    extra_files = set(live_filepath_to_hash.keys() - expected_filepath_to_hash.keys())

    out_dated_files = set()
    common_files = set(expected_filepath_to_hash.keys() & live_filepath_to_hash.keys())
    for filepath in common_files:
        if expected_filepath_to_hash[filepath] is None:
            log.error(f'Missing expected_file_hash for {filepath}')
            continue
        if expected_filepath_to_hash[filepath] != live_filepath_to_hash[filepath]:
            out_dated_files.add(filepath)

    files_to_move_in = missing_files | out_dated_files
    for filepath in files_to_move_in:
        expected_file_hash = expected_filepath_to_hash[filepath]
        if expected_file_hash is None:
            log.error(f'Missing expected_file_hash for {filepath}')
            continue
        queue_file_path = f'queue/{filepath}.{expected_file_hash}'
        if fs.exists(queue_file_path):
            fs.movefile(queue_file_path, filepath)
        else:
            log.error(f'Missing file in queue: {queue_file_path}')

    for filepath in extra_files:
        fs.remove(filepath)


@click.command()
@click.option(
    '-i',
    '--importdir',
    type=click.Path(exists=True),
    default=ANTHOLOGY_DATA_DIR,
    help="Directory to import the Anthology XML data files from.",
)
@click.option(
    '--server-location',
    required=True,
    type=click.Choice(
        [ServerLocation.REMOTE, ServerLocation.LOCAL], case_sensitive=False
    ),
)
@click.option(
    '-c',
    '--commit',
    is_flag=True,
    help="Commit (=write) the changes to the anthology server; will only do a dry run otherwise.",
)
@click.option(
    '--complete-check', is_flag=True, help="Do a complete check of resources on server."
)
@click.option('--debug', is_flag=True, help="Output debug-level log messages.")
def main(
    importdir: str,
    server_location: str,
    remote: bool,
    commit: bool,
    complete_check: bool,
    debug: bool,
):
    log_level = log.DEBUG if debug else log.INFO
    log.basicConfig(format="%(levelname)-8s %(message)s", level=log_level)
    tracker = SeverityTracker()
    log.getLogger().addHandler(tracker)

    log.info(f"Remote {remote}")

    if server_location != ServerLocation.REMOTE:
        log.error("Running this script locally on the server isn't supported yet!")
        exit(1)

    # if not is_clean_checkout_of_remote_branch(importdir, REMOTE_URL, REMOTE_MAIN_BRANCH_NAME):
    #     log.error(f"Repo @ {importdir} isn't clean or isn't tracking the master remote branch.")

    log.info("Instantiating the Anthology...")
    anth = Anthology(importdir=importdir)

    fs = FileSystemOps(remote=remote, host=ANTHOLOGY_HOST, commit=commit)

    if complete_check:
        complete_check()
    else:
        process_queue(anth, resource_type=ResourceType.PDF, fs=fs)
        process_queue(anth, resource_type=ResourceType.ATTACHMENT, fs=fs)

    if tracker.highest >= log.ERROR:
        exit(1)


if __name__ == "__main__":
    main()

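A design note on patch 6: FileSystemOps funnels every file operation through one interface, so process_queue and complete_check never care whether they run against the local disk or over ssh, and every mutating call honors the commit flag. A hedged dry-run sketch against the remote (the queued filename is made up):

    fs = FileSystemOps(remote=True, host=ANTHOLOGY_HOST, commit=False)
    print(fs.listdir('queue/pdf'))                 # runs `ssh anth ls .../queue/pdf`
    fs.remove('queue/pdf/acl/stale.pdf.deadbeef')  # commit=False, so this only logs "Would run: ..."
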
From 3c381a9964a76c217170200e43c66ce150060ec6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xinru=20Yan=20=F0=9F=87=A8=F0=9F=87=B3=20=20=F0=9F=91=91?=
Date: Mon, 28 Mar 2022 12:16:08 -0700
Subject: [PATCH 7/9] minor update during testing

---
 bin/commit_queue.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/bin/commit_queue.py b/bin/commit_queue.py
index cc45401739..dc85032957 100644
--- a/bin/commit_queue.py
+++ b/bin/commit_queue.py
@@ -183,7 +183,7 @@ def process_queue(anth: Anthology, resource_type: ResourceType, fs: FileSystemOp
     for venue_name in fs.listdir(queue_base_path):
         for filename in fs.listdir(os.path.join(queue_base_path, venue_name)):
             base_filename, file_hash = filename.rsplit('.', 1)
-
+            
             # Get main branch resource hash
             try:
                 current_version_hash = anth.get_hash_for_resource(
@@ -283,6 +283,11 @@ def complete_check(anth: Anthology, resource_type: ResourceType, fs: FileSystemO
         [ServerLocation.REMOTE, ServerLocation.LOCAL], case_sensitive=False
     ),
 )
+@click.option(
+    '--remote',
+    required=True,
+    default=True,
+)
 @click.option(
     '-c',
     '--commit',
@@ -314,6 +319,7 @@ def main(
 
     # if not is_clean_checkout_of_remote_branch(importdir, REMOTE_URL, REMOTE_MAIN_BRANCH_NAME):
     #     log.error(f"Repo @ {importdir} isn't clean or isn't tracking the master remote branch.")
+    # can not be tested since code is not on master yet
 
     log.info("Instantiating the Anthology...")
     anth = Anthology(importdir=importdir)

From 04194ff1dfec3681fc4eca61fbf4c8a1b9fef8ce Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xinru=20Yan=20=F0=9F=87=A8=F0=9F=87=B3=20=20=F0=9F=91=91?=
Date: Thu, 7 Apr 2022 19:08:24 -0700
Subject: [PATCH 8/9] minor fix

---
 bin/commit_queue.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/commit_queue.py b/bin/commit_queue.py
index dc85032957..cb712caa1f 100644
--- a/bin/commit_queue.py
+++ b/bin/commit_queue.py
@@ -183,7 +183,7 @@ def process_queue(anth: Anthology, resource_type: ResourceType, fs: FileSystemOp
     for venue_name in fs.listdir(queue_base_path):
         for filename in fs.listdir(os.path.join(queue_base_path, venue_name)):
             base_filename, file_hash = filename.rsplit('.', 1)
-            
+
             # Get main branch resource hash
             try:
                 current_version_hash = anth.get_hash_for_resource(

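Patches 7 and 8 only shuffle whitespace around the line that splits a queued filename back into its target name and checksum. Because the hash was appended as one final dot-suffix at enqueue time, rsplit with maxsplit=1 recovers it safely even though Anthology filenames themselves contain dots. For illustration:

    >>> "2022.acl-long.1.pdf.4d36e96f".rsplit('.', 1)
    ['2022.acl-long.1.pdf', '4d36e96f']
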
From d159d3f9b48faa843cd2f32deac30e4a23848569 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xinru=20Yan=20=F0=9F=87=A8=F0=9F=87=B3=20=20=F0=9F=91=91?=
Date: Thu, 7 Apr 2022 22:07:43 -0700
Subject: [PATCH 9/9] major fixes

---
 bin/commit_queue.py | 210 +++++++++++++++++---------------------------
 1 file changed, 80 insertions(+), 130 deletions(-)

diff --git a/bin/commit_queue.py b/bin/commit_queue.py
index cb712caa1f..35cc9f3ae7 100644
--- a/bin/commit_queue.py
+++ b/bin/commit_queue.py
@@ -11,6 +11,7 @@
 # 4. Combine both #2 and #3
 # pros: will detect missing resources, extra resources and outdated resources
 # cons: have to iterate all Papers, etc... in xml and checksum all files in the "real" location
+# Picking option #1 as default and #4 as a complete check (will implement in future)
 
 
 from typing import List, Optional
@@ -40,11 +41,6 @@
 REMOTE_MAIN_BRANCH_NAME = "master"
 
 
-class ServerLocation:
-    REMOTE = 'remote'
-    LOCAL = 'local'
-
-
 def is_clean_checkout_of_remote_branch(
     repo_dir: str, remote_url: str, remote_main_branch_name: str
 ) -> bool:
@@ -104,90 +100,109 @@ def run_remote_command(cmd):
 
 
 class FileSystemOps:
-    def __init__(self, remote: bool, host: Optional[str], commit: bool):
-        self.remote = remote
+    def __init__(self, is_on_server: bool, host: Optional[str], commit: bool):
+        self.is_on_server = is_on_server
         self.host = host
         self.commit = commit
-        if remote and not host:
-            raise Exception(f"If remote is true host is required but got host: {host!r}")
+        if not is_on_server and not host:
+            raise Exception(
+                f"If is_on_server is false, host is required but got host: {host!r}"
+            )
 
-        self.root_dir = ANTHOLOGY_FILE_ROOT if remote else ANTHOLOGY_DATA_DIR
+        self.root_dir = ANTHOLOGY_DATA_DIR if is_on_server else ANTHOLOGY_FILE_ROOT
 
     def listdir(self, relative_path: str) -> List[str]:
         abs_dir = f'{self.root_dir}/{relative_path}'
-        if self.remote:
-            return (
-                subprocess.check_output(['ssh', self.host, f'ls {abs_dir}'])
-                .decode('utf-8')
-                .strip()
-                .split('\n')
-            )
-        else:
+        if self.is_on_server:
             return os.listdir(abs_dir)
+        return (
+            subprocess.check_output(['ssh', self.host, f'ls {abs_dir}'])
+            .decode('utf-8')
+            .strip()
+            .split('\n')
+        )
 
     def movefile(self, relative_src_path: str, relative_dest_path: str):
         abs_src = f'{self.root_dir}/{relative_src_path}'
         abs_dest = f'{self.root_dir}/{relative_dest_path}'
+        abs_dest_dir = os.path.dirname(abs_dest)
 
-        if self.remote:
-            cmd = ['ssh', self.host, f'mv {abs_src} {abs_dest}']
+        if self.is_on_server:
             if self.commit:
-                subprocess.check_call(cmd)
+                os.makedirs(abs_dest_dir, exist_ok=True)
             else:
-                log.info(f"Would run: {cmd}")
-        else:
+                log.info(f"Would super-mkdir {abs_dest_dir!r}")
             if self.commit:
                 os.rename(abs_src, abs_dest)
             else:
                 log.info(f"Would move file {abs_src!r} to {abs_dest!r}")
+            return
+        mkdir_cmd = [
+            'ssh',
+            ANTHOLOGY_HOST,
+            f'mkdir -p {abs_dest_dir}',
+        ]
+        if self.commit:
+            subprocess.check_call(mkdir_cmd)
+        else:
+            log.info(f"Would run: {mkdir_cmd}")
+
+        cmd = ['ssh', self.host, f'mv {abs_src} {abs_dest}']
+        if self.commit:
+            subprocess.check_call(cmd)
+        else:
+            log.info(f"Would run: {cmd}")
 
     def hashfile(self, relative_path: str) -> str:
         abs_dir = f'{self.root_dir}/{relative_path}'
-        if self.remote:
-            return (
-                subprocess.check_output(['ssh', self.host, f'crc32 {abs_dir}'])
-                .decode('utf-8')
-                .strip()
-            )
-        else:
+        if self.is_on_server:
             return compute_hash_from_file(abs_dir)
+        return (
+            subprocess.check_output(['ssh', self.host, f'crc32 {abs_dir}'])
+            .decode('utf-8')
+            .strip()
+        )
 
     def exists(self, relative_path: str) -> bool:
         abs_dir = f'{self.root_dir}/{relative_path}'
-        if self.remote:
-            try:
-                subprocess.check_output(['ssh', self.host, f'stat {abs_dir}'])
-                return True
-            except subprocess.CalledProcessError:
-                return False
-        else:
+        if self.is_on_server:
             return os.path.exists(abs_dir)
+        try:
+            subprocess.check_output(['ssh', self.host, f'stat {abs_dir}'])
+            return True
+        except subprocess.CalledProcessError:
+            return False
 
-    def remove(self, relative_path: str) -> bool:
+    def remove(self, relative_path: str):
         abs_dir = f'{self.root_dir}/{relative_path}'
-        if self.remote:
-            cmd = ['ssh', self.host, f'rm {abs_dir}']
-            if self.commit:
-                subprocess.check_call(cmd)
-            else:
-                log.info(f"Would run: {cmd}")
-        else:
+        if self.is_on_server:
             if self.commit:
                 os.remove(abs_dir)
             else:
                 log.info(f"Would remove file {abs_dir!r}")
+            return
+        cmd = ['ssh', self.host, f'rm {abs_dir}']
+        if self.commit:
+            subprocess.check_call(cmd)
+        else:
+            log.info(f"Would run: {cmd}")
 
 
 def process_queue(anth: Anthology, resource_type: ResourceType, fs: FileSystemOps):
+    log.debug(f'Processing queue for {resource_type}')
     queue_base_path = f'queue/{resource_type.value}'
+    if not fs.exists(queue_base_path):
+        log.error(f'Missing queue directory: {queue_base_path}.')
+        return
     for venue_name in fs.listdir(queue_base_path):
         for filename in fs.listdir(os.path.join(queue_base_path, venue_name)):
+            log.debug(f'\tProcessing file {filename!r}')
             base_filename, file_hash = filename.rsplit('.', 1)
 
             # Get main branch resource hash
             try:
                 current_version_hash = anth.get_hash_for_resource(
-                    anth, resource_type, base_filename
+                    resource_type, base_filename
                 )
             except Exception as e:
                 log.error(f"{e} (filename: {filename!r})", exc_info=True)
                 continue
@@ -203,69 +218,8 @@ def process_queue(anth: Anthology, resource_type: ResourceType, fs: FileSystemOp
         )
 
 
-def get_all_pdf_filepath_to_hash(anth: Anthology):
-    filepath_to_hash = {}
-    for _, paper in anth.papers.items():
-        if paper.pdf is not None:
-            filepath = (
-                f"{ResourceType.PDF.value}/{paper.collection_id}/{paper.full_id}.pdf"
-            )
-            filepath_to_hash[filepath] = paper.pdf_hash
-
-    return filepath_to_hash
-
-
-def get_all_attachment_filepath_to_hash(anth: Anthology):
-    filepath_to_hash = {}
-    for _, paper in anth.papers.items():
-        for attachment in paper.attachments:
-            filepath = f"{ResourceType.ATTACHMENT.value}/{paper.collection_id}/{attachment['filename']}"
-            filepath_to_hash[filepath] = attachment['hash']
-
-    return filepath_to_hash
-
-
-def complete_check(anth: Anthology, resource_type: ResourceType, fs: FileSystemOps):
+def do_complete_check(anth: Anthology, resource_type: ResourceType, fs: FileSystemOps):
     log.error("Complete check isn't implemented yet")
-    # get all hashes for files in server "live" directory
-    live_filepath_to_hash = {}
-    base_path = f'{resource_type.value}'
-    for venue_name in fs.listdir(base_path):
-        for filename in fs.listdir(os.path.join(base_path, venue_name)):
-            filepath = os.path.join(base_path, venue_name, filename)
-            live_filepath_to_hash[filepath] = fs.hashfile(filepath)
-
-    expected_filepath_to_hash = {
-        ResourceType.PDF: get_all_pdf_filepath_to_hash,
-        ResourceType.ATTACHMENT: get_all_attachment_filepath_to_hash,
-    }[resource_type](anth)
-
-    missing_files = set(expected_filepath_to_hash.keys() - live_filepath_to_hash.keys())
-    extra_files = set(live_filepath_to_hash.keys() - expected_filepath_to_hash.keys())
-
-    out_dated_files = set()
-    common_files = set(expected_filepath_to_hash.keys() & live_filepath_to_hash.keys())
-    for filepath in common_files:
-        if expected_filepath_to_hash[filepath] is None:
-            log.error(f'Missing expected_file_hash for {filepath}')
-            continue
-        if expected_filepath_to_hash[filepath] != live_filepath_to_hash[filepath]:
-            out_dated_files.add(filepath)
-
-    files_to_move_in = missing_files | out_dated_files
-    for filepath in files_to_move_in:
-        expected_file_hash = expected_filepath_to_hash[filepath]
-        if expected_file_hash is None:
-            log.error(f'Missing expected_file_hash for {filepath}')
-            continue
-        queue_file_path = f'queue/{filepath}.{expected_file_hash}'
-        if fs.exists(queue_file_path):
-            fs.movefile(queue_file_path, filepath)
-        else:
-            log.error(f'Missing file in queue: {queue_file_path}')
-
-    for filepath in extra_files:
-        fs.remove(filepath)
 
 
 @click.command()
@@ -277,16 +231,9 @@ def complete_check(anth: Anthology, resource_type: ResourceType, fs: FileSystemO
     help="Directory to import the Anthology XML data files from.",
 )
 @click.option(
-    '--server-location',
-    required=True,
-    type=click.Choice(
-        [ServerLocation.REMOTE, ServerLocation.LOCAL], case_sensitive=False
-    ),
-)
-@click.option(
-    '--remote',
-    required=True,
-    default=True,
+    '--is-on-server',
+    is_flag=True,
+    help="If this flag is set, file system changes will be applied to the local file system; otherwise changes will be made by sshing into the anthology server.",
 )
 @click.option(
     '-c',
     '--commit',
@@ -300,8 +247,7 @@ def complete_check(anth: Anthology, resource_type: ResourceType, fs: FileSystemO
 @click.option('--debug', is_flag=True, help="Output debug-level log messages.")
 def main(
     importdir: str,
-    server_location: str,
-    remote: bool,
+    is_on_server: bool,
     commit: bool,
     complete_check: bool,
     debug: bool,
@@ -311,23 +257,27 @@ def main(
     tracker = SeverityTracker()
     log.getLogger().addHandler(tracker)
 
-    log.info(f"Remote {remote}")
-
-    if server_location != ServerLocation.REMOTE:
-        log.error("Running this script locally on the server isn't supported yet!")
-        exit(1)
+    log.info(
+        'Running as if on server.'
+        if is_on_server
+        else 'Will ssh to server for file system operations.'
+    )
 
-    # if not is_clean_checkout_of_remote_branch(importdir, REMOTE_URL, REMOTE_MAIN_BRANCH_NAME):
-    #     log.error(f"Repo @ {importdir} isn't clean or isn't tracking the master remote branch.")
-    # can not be tested since code is not on master yet
+    if not is_clean_checkout_of_remote_branch(
+        importdir, REMOTE_URL, REMOTE_MAIN_BRANCH_NAME
+    ):
+        log.error(
+            f"Repo @ {importdir} isn't clean or isn't tracking the master remote branch."
+        )
 
     log.info("Instantiating the Anthology...")
     anth = Anthology(importdir=importdir)
 
-    fs = FileSystemOps(remote=remote, host=ANTHOLOGY_HOST, commit=commit)
+    fs = FileSystemOps(is_on_server=is_on_server, host=ANTHOLOGY_HOST, commit=commit)
 
     if complete_check:
-        complete_check()
+        do_complete_check(anth, resource_type=ResourceType.PDF, fs=fs)
+        do_complete_check(anth, resource_type=ResourceType.ATTACHMENT, fs=fs)
     else:
         process_queue(anth, resource_type=ResourceType.PDF, fs=fs)
         process_queue(anth, resource_type=ResourceType.ATTACHMENT, fs=fs)
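
Taken together, the series gives a two-step publishing flow: enqueue_files.py checksums local files against the XML and stages them under queue/ on the server, and commit_queue.py later promotes a queued file to its live location only while the hash embedded in its filename still matches the XML. A hedged sketch of the commit-side dry run, mirroring main() above (nothing is written without -c/--commit):

    anth = Anthology(importdir=ANTHOLOGY_DATA_DIR)
    fs = FileSystemOps(is_on_server=False, host=ANTHOLOGY_HOST, commit=False)
    process_queue(anth, resource_type=ResourceType.PDF, fs=fs)        # logs the mv commands it would run
    process_queue(anth, resource_type=ResourceType.ATTACHMENT, fs=fs)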