-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain_merger
433 lines (358 loc) · 16.7 KB
/
main_merger
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
"""
PDF_TOC_Generator_with_Header_Footer_v1.py
Version: 1.0 - Date: 08-Aug-2024
Authors: George C. Cardoso; ChatGPT
Email: gcc@usp.br
GitHub: https://github.com/Photobiomedical-Instrumentation-Group
Description:
This script processes PDF files by adding headers, footers, and links, and generates a Table of Contents (TOC) for a collection of submissions.
It has been tested and verified to work well; see results from 10/Aug at 3:51 PM.
Note: The script still needs to include "link" text next to poster/oral.
Dependencies:
- PyPDF2
- reportlab
- PIL (Pillow)
- pandas
- Google Colab (files module)
In Google Colab, first run:
from google.colab import drive
drive.mount('/content/drive')
!pip install PyPDF2 reportlab
!pip install --upgrade PyPDF2 reportlab Pillow
!pip install --upgrade PyPDF2 reportlab Pillow openpyxl
Usage:
Ensure all required dependencies are installed.
Needs:
- Excel spreadsheet filled out with filenames (with or without .pdf), titles, authors, and type of presentation.
- PDF files with filenames identical to the ones in the spreadsheet (attention: case-sensitive!).
- All filenames are case-sensitive (including the extensions .jpg, .pdf, etc.)!
- Filenames must have no spaces. Use "File_One.pdf" instead of "File One.pdf" (this requirement has been updated below).
"""
from PyPDF2 import PdfReader, PdfWriter
from PyPDF2.generic import NameObject, DictionaryObject, ArrayObject, NumberObject, IndirectObject
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import A4, inch
from reportlab.lib.units import inch
from PIL import Image as PILImage
from io import BytesIO
import os
import re
import random
import pandas as pd
from google.colab import files
# Global variables for formatting.
# Lengths are expressed with reportlab's `inch` unit (imported above) and are
# used directly as canvas / annotation coordinates.
PAGE_SIZE = A4
LEFT_MARGIN = inch
RIGHT_MARGIN = 0.5 * inch
# Multiplies the header image's pixel width in add_header_footer, and the page
# width when add_link_to_header_footer sizes the clickable header rectangle.
HEADER_IMAGE_RESIZE_FACTOR = 0.5
HEADER_IMAGE_HEIGHT = 1.0 * inch # Control the height of the header image
TOC_TITLE_FONT_SIZE = 14
TOC_FONT_SIZE = 10
FOOTER_FONT_SIZE = 10 # Font size for the footer text
HEADER_VERT_DISPLACE = 0.5 * inch # Lower values mean higher header
# generate_toc_pdf starts a new TOC page once its y cursor drops below this value.
TOC_LOWER_MARGIN = 1 * inch # Lower margin for the TOC
# Global font settings
FONT_TYPE = "Helvetica"
# Passed by main() as the footer text drawn on every submission page.
FOOTER_TEXT = "CBEB Ribeirão Preto 2024. Click here to go to the Table of Contents"
def preprocess_authors(authors):
    """Return a cleaned, comma-separated author string.

    Digits (e.g. affiliation markers such as "Smith1") are removed, the text
    is split into individual authors on commas, every word of each author is
    capitalized with ``str.capitalize`` and empty entries are dropped.

    Args:
        authors: Raw author string, authors separated by commas.

    Returns:
        The authors joined with ", ".
    """
    without_digits = re.sub(r'\d+', '', authors)
    cleaned_names = []
    # Split on commas *before* capitalizing: capitalizing whitespace-separated
    # tokens first would treat "smith,jane" as a single word and leave the
    # second author's first name in lower case.
    for raw_name in without_digits.split(','):
        name = ' '.join(word.capitalize() for word in raw_name.split())
        if name:
            cleaned_names.append(name)
    return ', '.join(cleaned_names)
def normalize_filename(filename):
    """Return *filename* with a ``.pdf`` extension, adding one if missing.

    Leading and trailing whitespace is removed first, because a spreadsheet
    cell such as ``"paper.pdf "`` would otherwise become ``"paper.pdf .pdf"``
    and never match a file on disk. The extension check is case-insensitive,
    so an existing ``.PDF`` suffix is kept as written.
    """
    filename = filename.strip()
    if not filename.lower().endswith('.pdf'):
        filename += '.pdf'
    return filename
def add_header_footer(page, header_image_path, footer_text, page_num, toc_page_num=None):
    """Overlay a header image, footer text and a page number onto *page*.

    The overlay is drawn on a PAGE_SIZE reportlab canvas and merged into the
    given page in place. If ``toc_page_num`` is provided, link annotations
    covering the header and the footer are added that jump to that page.

    Args:
        page: PyPDF2 page object to modify in place.
        header_image_path: Path of the image drawn at the top of the page.
        footer_text: Text drawn at the left margin of the footer.
        page_num: Number printed right-aligned in the footer as "Page N".
        toc_page_num: Destination page for the header/footer links, or None
            to add no links.

    Returns:
        None. If the header image cannot be opened or drawn, an error is
        printed and the page is left without any overlay or links.
    """
    try:
        # The context manager closes the image file; this function runs once
        # per page, so an unclosed handle would otherwise leak on every call.
        with PILImage.open(header_image_path) as img:
            img_width, _ = img.size
            img.verify()
    except Exception as e:
        print(f"Error opening header image: {e}")
        return

    packet = BytesIO()
    can = canvas.Canvas(packet, pagesize=PAGE_SIZE)
    width, height = PAGE_SIZE

    try:
        # Width follows the image's pixel width; height is a fixed constant.
        new_width = img_width * HEADER_IMAGE_RESIZE_FACTOR
        new_height = HEADER_IMAGE_HEIGHT
        can.drawImage(header_image_path, LEFT_MARGIN, height - HEADER_VERT_DISPLACE - new_height,
                      width=new_width, height=new_height)
    except Exception as e:
        print(f"Error drawing header image: {e}")
        return

    # Footer text on the left, page number right-aligned at the right margin.
    can.setFont(FONT_TYPE, FOOTER_FONT_SIZE)
    can.drawString(LEFT_MARGIN, 30, footer_text)

    page_number_text = f"Page {page_num}"
    page_number_text_width = can.stringWidth(page_number_text, FONT_TYPE, FOOTER_FONT_SIZE)
    can.drawString(width - RIGHT_MARGIN - page_number_text_width, 30, page_number_text)

    can.save()
    packet.seek(0)

    # Read the one-page overlay back and stamp it onto the target page.
    overlay_pdf = PdfReader(packet)
    overlay_page = overlay_pdf.pages[0]
    page.merge_page(overlay_page)

    if toc_page_num is not None:
        add_link_to_header_footer(page, width, height, toc_page_num)
def add_link_to_header_footer(page, page_width, page_height, toc_page_num):
    """Attach two invisible link annotations (header and footer) to *page*.

    Both annotations carry a /GoTo action whose destination array starts with
    ``toc_page_num``. They are appended to the page's existing /Annots array,
    or a new array is created when the page has none.
    """

    def build_link(left, bottom, right, top):
        # One borderless /Link annotation over the given rectangle.
        goto_action = DictionaryObject({
            NameObject("/S"): NameObject("/GoTo"),
            NameObject("/D"): ArrayObject([NumberObject(toc_page_num),
                                           NameObject("/XYZ"),
                                           NumberObject(0),
                                           NumberObject(842),
                                           NumberObject(0)])
        })
        link = DictionaryObject()
        link.update({
            NameObject("/Type"): NameObject("/Annot"),
            NameObject("/Subtype"): NameObject("/Link"),
            NameObject("/Rect"): ArrayObject([
                NumberObject(left),
                NumberObject(bottom),
                NumberObject(right),
                NumberObject(top)
            ]),
            NameObject("/Border"): ArrayObject([NumberObject(0), NumberObject(0), NumberObject(0)]),
            NameObject("/A"): goto_action
        })
        return link

    # Header rectangle: from the left margin, HEADER_IMAGE_HEIGHT tall,
    # ending HEADER_VERT_DISPLACE below the top edge of the page.
    header_top = page_height - HEADER_VERT_DISPLACE
    header_link = build_link(
        LEFT_MARGIN,
        header_top - HEADER_IMAGE_HEIGHT,
        LEFT_MARGIN + HEADER_IMAGE_RESIZE_FACTOR * page_width,
        header_top,
    )

    # Footer rectangle: a 20-unit-high strip between the two side margins.
    footer_link = build_link(LEFT_MARGIN, 20, page_width - RIGHT_MARGIN, 40)

    new_links = [header_link, footer_link]
    if "/Annots" not in page:
        page[NameObject("/Annots")] = ArrayObject(new_links)
    else:
        page[NameObject("/Annots")].extend(new_links)
def create_pdf_with_header_footer(input_pdf_path, header_image_path, footer_text, start_page_num, toc_page_num):
    """Stamp every page of a PDF with header/footer and collect them in a PdfWriter.

    Pages are numbered consecutively starting at ``start_page_num``; the
    returned writer holds the pages in their original order.
    """
    source = PdfReader(input_pdf_path)
    stamped = PdfWriter()
    for offset, current_page in enumerate(source.pages):
        displayed_number = start_page_num + offset
        add_header_footer(current_page, header_image_path, footer_text, displayed_number, toc_page_num)
        stamped.add_page(current_page)
    return stamped
def generate_toc_pdf(toc_entries, header_image_path):
    """Build the Table of Contents (TOC) PDF with clickable entries.

    Args:
        toc_entries: Iterable of ``(title, author, presentation_type,
            page_num)`` tuples, one per submission.
        header_image_path: Image drawn at the top of the first TOC page. If
            it cannot be drawn, an error is printed and the TOC is produced
            without it.

    Returns:
        ``(toc_reader, links)`` where ``toc_reader`` is a PdfReader over the
        generated TOC and ``links`` is a list of
        ``(x1, y1, x2, y2, page_num)`` tuples, one per entry, describing the
        clickable rectangle around the first line of the entry.
    """
    buffer = BytesIO()
    can = canvas.Canvas(buffer, pagesize=PAGE_SIZE)
    links = []
    # TOC page index on which each link in `links` was drawn (kept separately
    # so the returned tuples keep their five-element shape).
    link_toc_pages = []
    toc_page_index = 0
    width, height = PAGE_SIZE
    margin = LEFT_MARGIN
    y_position = height - HEADER_VERT_DISPLACE - HEADER_IMAGE_HEIGHT - 40  # Adjusted for header image and title
    line_height = 15
    entry_spacing = 10
    # Space reserved on the right so wrapped text does not run into "Page N".
    max_text_width = width - 2 * margin - 100

    # Add header image (best effort: the TOC is still produced without it).
    try:
        # The context manager closes the image file after its size is read.
        with PILImage.open(header_image_path) as img:
            img_width, img_height = img.size
            img.verify()
        new_width = width - 2 * margin  # Adjust to fit within margins
        new_height = (img_height * new_width) / img_width  # Maintain aspect ratio
        can.drawImage(header_image_path, margin, y_position + 50, width=new_width, height=new_height)
        y_position -= new_height + 20
    except Exception as e:
        print(f"Error drawing header image on TOC page: {e}")

    # Add "Table of Contents" title
    can.setFont(FONT_TYPE, TOC_TITLE_FONT_SIZE)
    can.drawString(margin, y_position, "Table of Contents")
    y_position -= 40
    can.setFont(FONT_TYPE, TOC_FONT_SIZE)

    for title, author, presentation_type, page_num in toc_entries:
        link_text = f"({presentation_type}) {title}, by {author}"

        # Greedy word wrap.
        lines = []
        current_line = ""
        for word in link_text.split():
            if can.stringWidth(current_line + " " + word, FONT_TYPE, TOC_FONT_SIZE) < max_text_width:
                current_line += " " + word
            else:
                # Skip the flush when nothing has been collected yet (a first
                # word wider than the limit); otherwise an empty first line
                # would be drawn and would receive the link rectangle.
                if current_line.strip():
                    lines.append(current_line.strip())
                current_line = word
        lines.append(current_line.strip())

        # Draw each line of text; only the first line is clickable.
        for i, line in enumerate(lines):
            can.drawString(margin, y_position, line)
            if i == 0:
                text_width = can.stringWidth(line, FONT_TYPE, TOC_FONT_SIZE)
                links.append((margin, y_position, margin + text_width, y_position + line_height, page_num))
                link_toc_pages.append(toc_page_index)
            y_position -= line_height

        # Draw the page number on the same line as the last line of the title
        page_number_text = f"Page {page_num}"
        can.drawRightString(width - margin, y_position + line_height, page_number_text)
        y_position -= entry_spacing

        if y_position < TOC_LOWER_MARGIN:
            can.showPage()
            toc_page_index += 1
            y_position = height - HEADER_VERT_DISPLACE - HEADER_IMAGE_HEIGHT - 40  # Reset for the new page
            can.setFont(FONT_TYPE, TOC_FONT_SIZE)  # Ensure consistent font settings on the new page

    can.save()
    buffer.seek(0)
    toc_reader = PdfReader(buffer)
    toc_pages = list(toc_reader.pages)
    toc_page_total = len(toc_pages)

    # Attach each link to the TOC page its entry was actually drawn on
    # (previously every link was attached to the first TOC page).
    for (x1, y1, x2, y2, page_num), toc_index in zip(links, link_toc_pages):
        annotation = DictionaryObject()
        annotation.update({
            NameObject("/Type"): NameObject("/Annot"),
            NameObject("/Subtype"): NameObject("/Link"),
            NameObject("/Rect"): ArrayObject([NumberObject(x1), NumberObject(y1), NumberObject(x2), NumberObject(y2)]),
            NameObject("/Border"): ArrayObject([NumberObject(0), NumberObject(0), NumberObject(0)]),
            NameObject("/A"): DictionaryObject({
                NameObject("/S"): NameObject("/GoTo"),
                # page_num is shifted by the number of TOC pages; presumably
                # the TOC is placed in front of the submissions in the merged
                # document - confirm against the caller.
                NameObject("/D"): ArrayObject([NumberObject(page_num + toc_page_total - 1),
                                               NameObject("/XYZ"),
                                               NumberObject(0),
                                               NumberObject(842),
                                               NumberObject(0)])
            })
        })
        toc_page = toc_pages[toc_index]
        if "/Annots" in toc_page:
            toc_page[NameObject("/Annots")].append(annotation)
        else:
            toc_page[NameObject("/Annots")] = ArrayObject([annotation])

    return toc_reader, links
def extract_file_num(filename):
    """Return the leading integer of a ``<digits>_<anything>.pdf`` filename.

    Raises:
        ValueError: If the filename does not start with digits followed by an
            underscore and contain a (case-insensitive) ``.pdf``.
    """
    found = re.match(r"(\d+)_.*\.pdf", filename, re.IGNORECASE)
    if found is None:
        raise ValueError(f"Filename {filename} does not match the expected format.")
    return int(found.group(1))
def count_pdf_files(folder):
    """Return how many directory entries in *folder* end in ``.pdf`` (any case)."""
    total = 0
    for entry in os.listdir(folder):
        if entry.lower().endswith('.pdf'):
            total += 1
    return total
def reorder_first_n_pages_with_links(input_pdf_path, page_order):
    """Reorder the first N pages of a PDF and retarget its GoTo links.

    The first ``len(page_order)`` pages are rearranged so that new page ``i``
    is old page ``page_order[i]``; the remaining pages keep their position.
    Link annotations whose /GoTo destination starts with a numeric page index
    are rewritten so they still point at the same page after the reordering.
    Annotations travel with their page, so they are not re-added.

    Args:
        input_pdf_path: Path of the PDF to read.
        page_order: Old page indices in their new order, e.g. ``[1, 0, 2]``.

    Returns:
        Path of the written file: the input path with its extension replaced
        by ``_reordered_with_links.pdf``.

    Raises:
        IndexError: If ``page_order`` refers to a page the PDF does not have.
    """
    reader = PdfReader(input_pdf_path)
    writer = PdfWriter()
    pages = list(reader.pages)
    n = len(page_order)

    # Old page index -> new page index for the reordered prefix. setdefault
    # keeps the first occurrence, matching list.index() semantics.
    new_index_of = {}
    for new_index, old_index in enumerate(page_order):
        new_index_of.setdefault(old_index, new_index)

    # Retarget destinations BEFORE the pages are handed to the writer, so the
    # change is present in whatever the writer stores. The destination is
    # remapped from the page it pointed to, not from the page hosting the link.
    seen = set()
    for page in pages:
        if "/Annots" not in page:
            continue
        for annotation in page[NameObject("/Annots")]:
            link = annotation.get_object()
            if id(link) in seen:
                continue  # never remap the same annotation twice
            seen.add(id(link))
            if link.get("/Subtype") != "/Link":
                continue
            if "/A" not in link or "/D" not in link["/A"]:
                continue
            link_dest = link["/A"]["/D"]
            if not isinstance(link_dest, list) or not link_dest:
                continue  # named destination or empty array: nothing to remap
            target = link_dest[0]
            if isinstance(target, NumberObject) and int(target) in new_index_of:
                link_dest[0] = NumberObject(new_index_of[int(target)])

    # Reorder the first N pages and keep the rest as they are.
    reordered_pages = [pages[i] for i in page_order]
    reordered_pages.extend(pages[n:])
    for page in reordered_pages:
        writer.add_page(page)

    # splitext only touches the final extension and always yields a path that
    # differs from the input, so the source file is never overwritten.
    base_path, _ = os.path.splitext(input_pdf_path)
    output_pdf_path = f"{base_path}_reordered_with_links.pdf"
    with open(output_pdf_path, "wb") as f:
        writer.write(f)

    return output_pdf_path
def main():
    """Merge all submissions listed in the spreadsheet into one PDF with a TOC.

    Steps:
      1. Read the Excel table and keep rows whose four required columns are
         all filled in.
      2. For each row, stamp the PDF with header/footer and append its pages
         to the combined document; rows whose file is missing are skipped.
      3. Generate the TOC, insert it in front and write the merged PDF.
      4. Reorder the first pages, write the reordered copy and download it
         through Google Colab.

    Raises:
        KeyError: If the spreadsheet lacks one of the required columns.
    """
    content_folder = "/content/drive/MyDrive/Colab Notebooks/CBEB-pdfs/PDF3"
    header_image_path = f"{content_folder}/header_image.jpg"
    output_dir = "processed_submissions"
    os.makedirs(output_dir, exist_ok=True)

    table_path = f"{content_folder}/TabelaTrabalhos4colsV2.xlsx"
    df = pd.read_excel(table_path)
    expected_columns = {'PDF Filename', 'Title', 'Authors', 'Oral/Poster'}
    if not expected_columns.issubset(df.columns):
        raise KeyError(f"Expected columns {expected_columns}, but found {df.columns}")

    # Only the required columns decide whether a row is usable; an empty cell
    # in any extra column must not silently drop a submission.
    df = df.dropna(subset=sorted(expected_columns))
    df = df.astype({"PDF Filename": str, "Title": str, "Authors": str, "Oral/Poster": str})

    toc_entries = []
    combined_writer = PdfWriter()
    page_number = 1  # Number printed on the first page of the next submission

    for _, row in df.iterrows():
        filename = normalize_filename(row['PDF Filename'][:1000])  # Ensure the filename ends with .pdf
        input_file = os.path.join(content_folder, filename)
        if not os.path.exists(input_file):
            print(f"File {input_file} not found. Skipping.")
            continue

        title = row['Title']
        author = preprocess_authors(row['Authors'])
        presentation_type = row['Oral/Poster']

        print(f"Processing {input_file}")
        writer = create_pdf_with_header_footer(input_file, header_image_path, FOOTER_TEXT, page_number, toc_page_num=0)
        num_pages = len(writer.pages)
        toc_entries.append((title, author, presentation_type, page_number))
        page_number += num_pages

        for page in writer.pages:
            combined_writer.add_page(page)

    toc_reader, links = generate_toc_pdf(toc_entries, header_image_path)

    # NOTE(review): every TOC page is inserted at index 0, so a TOC with more
    # than one page ends up in reverse order in the combined document.
    for page in toc_reader.pages:
        combined_writer.insert_page(page, 0)

    toc_page_count = len(toc_reader.pages)

    # NOTE(review): generate_toc_pdf also attaches link annotations to the TOC
    # pages it returns, so this loop adds a second copy of every link, all on
    # combined_writer.pages[0].
    for (x1, y1, x2, y2, page_num) in links:
        annotation = DictionaryObject()
        annotation.update({
            NameObject("/Type"): NameObject("/Annot"),
            NameObject("/Subtype"): NameObject("/Link"),
            NameObject("/Rect"): ArrayObject([NumberObject(x1), NumberObject(y1), NumberObject(x2), NumberObject(y2)]),
            NameObject("/Border"): ArrayObject([NumberObject(0), NumberObject(0), NumberObject(0)]),
            NameObject("/A"): DictionaryObject({
                NameObject("/S"): NameObject("/GoTo"),
                NameObject("/D"): ArrayObject([NumberObject(page_num + toc_page_count - 1),
                                               NameObject("/XYZ"),
                                               NumberObject(0),
                                               NumberObject(842),
                                               NumberObject(0)])
            })
        })
        toc_page = combined_writer.pages[0]
        if "/Annots" in toc_page:
            toc_page[NameObject("/Annots")].append(annotation)
        else:
            toc_page[NameObject("/Annots")] = ArrayObject([annotation])

    # Random suffix keeps repeated runs from overwriting earlier output.
    final_output_path = f"MergedPDFwithTOC_{random.randint(10000, 99999)}.pdf"
    with open(final_output_path, "wb") as f:
        combined_writer.write(f)
    print("Final PDF created successfully.")

    # NOTE(review): hard-coded order that swaps the first two pages of the
    # merged document - confirm this matches the intended layout.
    page_order = [1, 0, 2]  # Example page order
    reordered_pdf_path = reorder_first_n_pages_with_links(final_output_path, page_order)
    print(f"Reordered PDF saved as {reordered_pdf_path}")

    os.chmod(reordered_pdf_path, 0o777)
    files.download(reordered_pdf_path)


if __name__ == "__main__":
    main()