Skip to content

Commit 2e1c728

Browse files
committed
Added enqueue_files script, step 1 for #1818
1 parent 6765599 commit 2e1c728

File tree

2 files changed

+175
-0
lines changed

2 files changed

+175
-0
lines changed

bin/anthology/data.py

+19
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,25 @@
5454
"ANTHOLOGY_FILES", os.path.join(os.environ["HOME"], "anthology-files")
5555
)
5656

57+
# Location of the Anthology PDF files.
# Read from $ANTHOLOGY_PDFS when set; otherwise {ANTHOLOGY_FILE_DIR}/pdf.
ANTHOLOGY_PDF_DIR = os.getenv(
    "ANTHOLOGY_PDFS",
    os.path.join(ANTHOLOGY_FILE_DIR, "pdf"),
)

# Location of the Anthology attachment files.
# Read from $ANTHOLOGY_ATTACHMENTS when set; otherwise
# {ANTHOLOGY_FILE_DIR}/attachments.
ANTHOLOGY_ATTACHMENTS_DIR = os.getenv(
    "ANTHOLOGY_ATTACHMENTS",
    os.path.join(ANTHOLOGY_FILE_DIR, "attachments"),
)

# Location of the Anthology data.
# Read from $ANTHOLOGY_DATA when set; otherwise the "data" directory two
# levels above the directory containing this module ({git_repo_root}/data).
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
ANTHOLOGY_DATA_DIR = os.getenv(
    "ANTHOLOGY_DATA",
    os.path.abspath(os.path.join(_SCRIPT_DIR, os.pardir, os.pardir, "data")),
)
75+
5776
# Names of XML elements that may appear multiple times
5877
LIST_ELEMENTS = (
5978
"attachment",

bin/enqueue_files.py

+156
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: utf-8 -*-
3+
#
4+
# Copyright 2022 Xinru Yan <xinru1414@gmail.com>
5+
#
6+
# Licensed under the Apache License, Version 2.0 (the "License");
7+
# you may not use this file except in compliance with the License.
8+
# You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing, software
13+
# distributed under the License is distributed on an "AS IS" BASIS,
14+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
# See the License for the specific language governing permissions and
16+
# limitations under the License.
17+
18+
import os
19+
import click
20+
import logging as log
21+
from enum import Enum
22+
from functools import partial
23+
24+
from anthology import Anthology
25+
from anthology.utils import upload_file_to_queue
26+
from anthology.utils import SeverityTracker
27+
from anthology.data import (
28+
ANTHOLOGY_ATTACHMENTS_DIR,
29+
ANTHOLOGY_DATA_DIR,
30+
ANTHOLOGY_PDF_DIR,
31+
ResourceType,
32+
)
33+
34+
35+
# Enable show default by default
36+
# Rebind click.option to a partial that pre-sets show_default=True, so every
# @click.option declared after this line gets that keyword without repeating it.
click.option = partial(click.option, show_default=True)
37+
38+
39+
def get_proceedings_id_from_filename(resource_type: ResourceType, filename: str) -> str:
    """Extract the paper/volume ID that prefixes a resource filename.

    The ID is everything before the trailing dot-separated components of
    ``filename``: one component is stripped for PDFs and two for attachments
    (e.g. ``"X.pdf"`` -> ``"X"`` and ``"X.Name.zip"`` -> ``"X"``). Dots that
    occur earlier in the name are kept as part of the ID.

    :param resource_type: Kind of resource the file represents.
    :param filename: File name without any directory part.
    :return: ``filename`` with its trailing component(s) removed.
    :raises KeyError: If ``resource_type`` is neither ``ResourceType.PDF`` nor
        ``ResourceType.ATTACHMENT``.
    """
    # How many trailing dot-separated components to strip per resource type.
    components_to_strip = {ResourceType.PDF: 1, ResourceType.ATTACHMENT: 2}
    stem, *_ = filename.rsplit('.', components_to_strip[resource_type])
    return stem
42+
43+
44+
def get_hash_for_resource(
    anth: Anthology, resource_type: ResourceType, filename: str
) -> str:
    """Return the hash stored in the Anthology for the given resource file.

    The paper/volume ID is derived from ``filename``. For PDFs the hash is the
    ``pdf_hash`` of the matching paper, or of the matching volume when no
    paper has that ID. For attachments the hash is taken from the entry of the
    paper's ``attachments`` whose ``'filename'`` equals ``filename``.

    :param anth: Anthology instance providing ``papers`` and ``volumes``.
    :param resource_type: Kind of resource the file represents.
    :param filename: File name without any directory part.
    :return: The hash recorded for the resource.
    :raises Exception: If the ID matches neither a paper nor a volume, if an
        attachment's ID does not refer to a paper, or if no hash is recorded
        for the resource.
    """
    proceedings_id = get_proceedings_id_from_filename(resource_type, filename)
    if proceedings_id not in anth.papers and proceedings_id not in anth.volumes:
        raise Exception(f"Paper/Volume for PDF {proceedings_id!r} does not exist.")

    resource_hash = None
    if resource_type == ResourceType.PDF:
        # Prefer the paper entry; fall back to the volume with the same ID.
        resource_hash = anth.papers.get(
            proceedings_id, anth.volumes.get(proceedings_id)
        ).pdf_hash
    elif resource_type == ResourceType.ATTACHMENT:
        # Attachments are only looked up on papers. The ID may have passed the
        # existence check above solely because it names a volume; report that
        # explicitly instead of failing with a bare KeyError.
        if proceedings_id not in anth.papers:
            raise Exception(
                f"Attachment {filename!r} refers to {proceedings_id!r}, "
                "which is a volume and not a paper."
            )
        attachments = anth.papers[proceedings_id].attachments
        filename_to_hash = {a['filename']: a['hash'] for a in attachments}
        resource_hash = filename_to_hash.get(filename)

    # Also reached for an attachment filename that the paper does not list.
    if resource_hash is None:
        raise Exception(
            "Hash for resource is None. Please update with value before running this script."
        )

    return resource_hash
67+
68+
69+
# Iterate over files in resource directory, find the hash in the Anthology and upload the file (if commit)
def enqueue_dir(
    anth: Anthology,
    resource_directory: str,
    resource_type: ResourceType,
    commit: bool = False,
):
    """Enqueue every file found one level below ``resource_directory``.

    The directory is expected to contain one sub-directory per venue, each
    holding the resource files. For every file the hash is looked up in the
    Anthology and the file is handed to ``upload_file_to_queue``. Files whose
    hash lookup fails are logged as errors and skipped.

    :param anth: Anthology instance used for the hash lookup.
    :param resource_directory: Root directory containing the venue directories.
    :param resource_type: Kind of resource stored under ``resource_directory``.
    :param commit: Forwarded unchanged to ``upload_file_to_queue``.
    """
    # Sorted so that processing order (and thus log output) is reproducible.
    for venue_name in sorted(os.listdir(resource_directory)):
        venue_path = os.path.join(resource_directory, venue_name)

        # Stray files next to the venue directories (e.g. ".DS_Store") would
        # make the inner listdir raise NotADirectoryError and abort the run.
        if not os.path.isdir(venue_path):
            log.debug(f"Skipping non-directory entry {venue_path!r}")
            continue

        for filename in sorted(os.listdir(venue_path)):
            local_path = os.path.join(venue_path, filename)

            # Get resource hash
            try:
                resource_hash = get_hash_for_resource(anth, resource_type, filename)
            except Exception as e:
                # Best effort: record the failure and carry on with the
                # remaining files so that all problems are reported in one run.
                log.error(f"{e} (filename: {local_path!r})", exc_info=True)
                continue

            upload_file_to_queue(
                local_path,
                resource_type=resource_type,
                venue_name=venue_name,
                filename=filename,
                file_hash=resource_hash,
                commit=commit,
            )
95+
96+
97+
@click.command()
@click.option(
    '-i',
    '--importdir',
    type=click.Path(exists=True),
    default=ANTHOLOGY_DATA_DIR,
    help="Directory to import the Anthology XML data files from.",
)
@click.option(
    '-p',
    '--pdfs-dir',
    type=click.Path(exists=True),
    default=ANTHOLOGY_PDF_DIR,
    help="Root path for placement of PDF files",
)
@click.option(
    '-a',
    '--attachments-dir',
    type=click.Path(exists=True),
    default=ANTHOLOGY_ATTACHMENTS_DIR,
    help="Root path for placement of attachment files",
)
@click.option(
    '-c',
    '--commit',
    is_flag=True,
    help="Commit (=write) the changes to the anthology server; will only do a dry run otherwise.",
)
@click.option('--debug', is_flag=True, help="Output debug-level log messages.")
def main(importdir, pdfs_dir, attachments_dir, commit, debug):
    """Enqueue local PDF and attachment files for upload to the Anthology server."""
    log_level = log.DEBUG if debug else log.INFO
    log.basicConfig(format="%(levelname)-8s %(message)s", level=log_level)
    # The tracker is attached as a log handler; its `highest` attribute is
    # compared against log.ERROR below to decide the exit status.
    tracker = SeverityTracker()
    log.getLogger().addHandler(tracker)

    log.info("Instantiating the Anthology...")
    anth = Anthology(importdir=importdir)

    log.info("Enqueuing PDFs...")
    enqueue_dir(anth, pdfs_dir, ResourceType.PDF, commit)

    log.info("Enqueuing Attachments...")
    enqueue_dir(anth, attachments_dir, ResourceType.ATTACHMENT, commit)

    # The warnings emitted below are below ERROR severity, so evaluating this
    # once up front gives the same result as re-checking afterwards.
    had_errors = tracker.highest >= log.ERROR

    if not commit:
        if had_errors:
            log.warning(
                "There were errors! Please check them carefully before re-running this script with -c/--commit."
            )
        else:
            log.warning(
                "Re-run this script with -c/--commit to upload these files to the server."
            )

    if had_errors:
        # The builtin exit() is injected by the `site` module for interactive
        # use and may be absent; raising SystemExit is always available.
        raise SystemExit(1)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)