Monday, August 14, 2023

Mass Convert Tiff to PDF with Python

I needed to convert over 200K TIFF files to PDF, so I wrote the Python script below (a commented version and a compact version).  Hope this helps.


//Larry


Version with Comments

# Python3 program to bulk convert image files to PDF
# Mass convert TIFF files to PDF
# Larry Billinghurst - 14 Aug 2023
# using img2pdf library

# Note: existing files with the same name in the target directory will be overwritten

# importing necessary libraries
import img2pdf
from PIL import Image
import os
import datetime
import argparse
import time


# Set up the parser and add arguments
parser = argparse.ArgumentParser(description="Convert image files to PDF")
parser.add_argument("--source_dir", default="C:/support/exadocs",
                    help="Directory of the source images")
parser.add_argument("--output_dir", default="C:/support/exadocs-pdf",
                    help="Directory to save the converted PDFs")
# The log directory is configurable; it defaults to C:/support
parser.add_argument("--log_dir", default="C:/support",
                    help="Directory to write the error log to")

# Parse the command line arguments
args = parser.parse_args()

# Start the timer
start_time = time.time()

# Initialize the file counter
file_counter = 0

# Use the arguments
source_dir = args.source_dir
output_dir = args.output_dir

print(f"Using source directory: {source_dir}")
print(f"Saving to output directory: {output_dir}")


# Create a timestamp so every run gets its own log file
current_timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
# Path for the log file with timestamp
log_file_path = f"{args.log_dir}/exadocs-log_{current_timestamp}.txt"

# File Extension to look for
# NOTE: names ending in '.tiff' or '.jpg' do not match these suffixes;
# add them here if such files should be converted as well.
file_extension_list = ('.tif','.jpeg')
# Get logfile ready to write if needed
def log_error(message):
    """Append one error message to the log file.

    The target is the module-level ``log_file_path``. The file is opened in
    append mode on every call, so it is created the first time this
    function is called.

    Args:
        message: Text to record; a newline is written after it.
    """
    # Create the log's folder on demand so a missing folder cannot make the
    # error path itself fail.
    log_dir = os.path.dirname(log_file_path)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)
    # Explicit UTF-8: the message may contain file names with characters the
    # platform's default encoding cannot represent.
    with open(log_file_path, 'a', encoding='utf-8') as log_file:
        log_file.write(message + "\n")

# Make sure the output directory exists; otherwise every write would fail
os.makedirs(output_dir, exist_ok=True)

# Compare extensions case-insensitively so e.g. ".TIF" is picked up too
wanted_extensions = tuple(ext.lower() for ext in file_extension_list)

# loop through all files in source directory
for target_file in os.listdir(source_dir):
    # Skip anything that does not have a matching file extension
    if not target_file.lower().endswith(wanted_extensions):
        continue

    # Store the filename without extension using the [0]
    filename = os.path.splitext(target_file)[0]
    # Build the pdf target filename with output directory
    pdf_target_file = os.path.join(output_dir, filename + '.pdf')
    # Set source filename with path
    source_file = os.path.join(source_dir, target_file)
    print(source_file)
    print(pdf_target_file)

    try:
        # Open and verify the image
        with Image.open(source_file) as img:
            img.verify()

        # Convert before opening the target, so a failed conversion does
        # not leave an empty PDF behind or truncate an existing one
        pdf_bytes = img2pdf.convert(source_file)
        # Open pdf target file as "wb" write binary
        with open(pdf_target_file, "wb") as out_file:
            out_file.write(pdf_bytes)
    except Exception as e:
        error_message = ("Verification or conversion failed for "
                         f"{source_file}. Error: {e}")
        print(error_message)
        # Record the failure in the log file as well as on the console
        log_error(error_message)
    else:
        # Count only the files that were actually converted
        file_counter += 1

# Report the final count
print(f"Number of files processed: {file_counter}")

# Measure the total run time
end_time = time.time()
elapsed_time = end_time - start_time

# Split the duration: whole hours first, then minutes and leftover seconds
hours, leftover = divmod(elapsed_time, 3600)
minutes, seconds = divmod(leftover, 60)

duration_text = (f"{int(hours)} hours, "
                 f"{int(minutes)} minutes, {seconds:.2f} seconds")
print(f"Script executed in: {duration_text}")


# End of Code




Compact Version


# Python3 program to bulk convert image files to PDF
import argparse
import datetime
import os
import time

from PIL import Image
import img2pdf


def log_error(message):
    """Append one error message, followed by a newline, to the log file.

    Writes to the module-level ``log_file_path``; the file is opened in
    append mode on every call.
    """
    # Create the log's folder on demand so logging a failure cannot itself
    # fail on a missing folder.
    log_dir = os.path.dirname(log_file_path)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)
    # Explicit UTF-8 so file names with non-ASCII characters can be written.
    with open(log_file_path, 'a', encoding='utf-8') as log_file:
        log_file.write(message + "\n")


parser = argparse.ArgumentParser(description="Convert image files to PDF")
parser.add_argument("--source_dir", default="C:/support/exadocs",
                    help="Directory of source images")
parser.add_argument("--output_dir", default="C:/support/exadocs-pdf",
                    help="Directory for PDFs")
# The log directory is configurable; it defaults to C:/support
parser.add_argument("--log_dir", default="C:/support",
                    help="Directory for the error log")

args = parser.parse_args()

start_time = time.time()
file_counter = 0

# One timestamped log file per run
run_stamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
log_file_path = f"{args.log_dir}/exadocs-log_{run_stamp}.txt"

# Create the output directory up front so writes do not fail on a missing folder
os.makedirs(args.output_dir, exist_ok=True)

for target_file in os.listdir(args.source_dir):
    # Case-insensitive match so ".TIF" / ".JPEG" are converted as well
    if not target_file.lower().endswith(('.tif', '.jpeg')):
        continue

    source_file = os.path.join(args.source_dir, target_file)
    pdf_target_file = os.path.join(
        args.output_dir, os.path.splitext(target_file)[0] + '.pdf')

    try:
        # Let Pillow check the image before attempting the conversion
        with Image.open(source_file) as img:
            img.verify()

        # Convert before opening the output, so a failed conversion does not
        # leave an empty PDF behind or truncate an existing one
        pdf_bytes = img2pdf.convert(source_file)
        with open(pdf_target_file, "wb") as out_file:
            out_file.write(pdf_bytes)
    except Exception as e:
        log_error(f"Failed for {source_file}. Error: {e}")
    else:
        # Only successfully written PDFs are counted
        file_counter += 1

# Break the elapsed wall-clock time into hours, minutes and seconds
end_time = time.time()
elapsed = end_time - start_time
hours, remainder = divmod(elapsed, 3600)
minutes, seconds = divmod(remainder, 60)

summary = (f"Processed: {file_counter} files in "
           f"{int(hours)}h {int(minutes)}m {seconds:.2f}s.")
print(summary)