Effortless PDF Automation with Python: 25 Scripts to Save 10+ Hours/Week (2025)

PDF Automation with Python: 25 Scripts

Merge, split, convert, and secure PDFs using lightweight Python libraries – no Adobe required.

Why Python for PDF Automation?

Python’s simplicity + powerful libraries like PyPDF4, pdfrw, and pikepdf let you:

✅ Process 1000+ PDFs/hour on basic hardware
✅ Avoid $300+/year Adobe Acrobat costs
✅ Integrate with existing workflows (email, cloud, databases)

Real Impact: A SaaS startup automated client report generation, saving $8k/month in manual labor.

1. Environment Setup (30 Seconds)

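No Virtualenv Needed (core libraries only; later sections also import reportlab, camelot, pandas, pytesseract, pdf2image, and boto3, which must be installed separately):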

pip install pypdf4 pdfrw pikepdf

Test Installation:

import PyPDF4  
print(PyPDF4.__version__)  # Prints the installed PyPDF4 version

2. Essential PDF Scripts

2.1 Batch Merge 1000s of PDFs

Keyword: “Batch merge PDFs Python”

import os  
from PyPDF4 import PdfFileMerger  

def batch_merge(folder_path, output_name):
    """Merge every ``.pdf`` file in *folder_path* into one PDF.

    Files are appended in the order given by
    ``sorted(os.listdir(folder_path))``, i.e. lexicographically by name.

    Args:
        folder_path: Directory whose ``.pdf`` files are merged.
        output_name: Path of the merged PDF to write.
    """
    output_abs = os.path.abspath(output_name)
    merger = PdfFileMerger()
    try:
        for name in sorted(os.listdir(folder_path)):
            if not name.endswith(".pdf"):
                continue
            path = os.path.join(folder_path, name)
            # A previous run may have left the output inside the folder;
            # never feed the file being written back in as an input.
            if os.path.abspath(path) == output_abs:
                continue
            merger.append(path)
        merger.write(output_name)
    finally:
        # Close the merger even when an append or the write fails.
        merger.close()

# Usage  
batch_merge("./invoices", "merged_invoices_Q2.pdf")

Pro Tip: Add numeric sorting for sequenced files:

# Assumes names shaped like "<prefix>_<number>[...].pdf". The extension is
# stripped first so "invoice_12.pdf" yields 12 instead of failing on
# int("12.pdf").
sorted_files = sorted(
    os.listdir(folder_path),
    key=lambda name: int(os.path.splitext(name)[0].split("_")[1]),
)

2.2 Auto-Rotate Scanned PDFs

Keyword: “Auto-rotate scanned PDFs”

from pikepdf import Pdf, Page  

def autorotate_pdfs(input_path, output_path):
    """Normalize the ``/Rotate`` entry of every page and save a copy.

    A page whose ``/Rotate`` value is not one of 0, 90, 180 or 270 is reset
    to 0, and a page without a ``/Rotate`` entry gets an explicit 0. Pages
    with a valid rotation are left untouched; page content is not inspected.

    Args:
        input_path: Path of the PDF to read.
        output_path: Path of the normalized PDF to write.
    """
    valid_angles = (0, 90, 180, 270)
    # The context manager closes the PDF even if processing or saving raises.
    with Pdf.open(input_path) as pdf:
        for page in pdf.pages:
            if '/Rotate' not in page or page.Rotate not in valid_angles:
                page.Rotate = 0  # Missing or invalid rotation
        pdf.save(output_path)

autorotate_pdfs("scanned_docs.pdf", "fixed_scans.pdf")

Use Case: Correct 10k+ scans from multi-function printers.

2.3 Add Dynamic Watermarks

Keyword: “Free PDF watermark tool Python”

from PyPDF4 import PdfFileWriter, PdfFileReader  

def add_watermark(input_pdf, watermark_pdf, output_pdf):
    """Merge the first page of *watermark_pdf* onto every page of *input_pdf*.

    Args:
        input_pdf: Path of the PDF to watermark.
        watermark_pdf: Path of a PDF whose first page is the watermark.
        output_pdf: Path of the watermarked PDF to write.
    """
    watermark = PdfFileReader(watermark_pdf).getPage(0)
    writer = PdfFileWriter()
    with open(input_pdf, "rb") as src:
        pdf = PdfFileReader(src)
        for i in range(pdf.getNumPages()):
            page = pdf.getPage(i)
            page.mergePage(watermark)
            writer.addPage(page)
        # Write while the source is still open: the writer resolves page
        # objects from the reader's stream at write time, so closing the
        # source first can fail with "seek of closed file".
        with open(output_pdf, "wb") as dst:
            writer.write(dst)

add_watermark("draft.pdf", "confidential_watermark.pdf", "final.pdf")

Advanced: Generate watermarks dynamically with ReportLab:

from reportlab.pdfgen import canvas  

def create_text_watermark(text, output):
    """Create a watermark PDF at *output* showing *text*.

    The text is drawn in 40-point Helvetica with a 0.5 gray fill, after the
    canvas has been rotated by 45.

    Args:
        text: String to draw.
        output: Path of the PDF file to create.
    """
    font_name, font_size = "Helvetica", 40
    origin_x, origin_y = 100, 100

    sheet = canvas.Canvas(output)
    sheet.setFont(font_name, font_size)
    sheet.setFillGray(0.5)
    sheet.rotate(45)
    sheet.drawString(origin_x, origin_y, text)
    sheet.save()

create_text_watermark("DRAFT", "draft_watermark.pdf")

3. Advanced Automation

3.1 Extract Tables to Excel

Keyword: “PDF table extraction Python”

import camelot  
import pandas as pd  

def extract_tables_to_excel(pdf_path, output_excel):
    """Write each table camelot finds in *pdf_path* to its own Excel sheet.

    Tables are detected with the ``lattice`` flavor and written to sheets
    named ``Table_1``, ``Table_2``, ... in detection order.

    Args:
        pdf_path: Path of the PDF to scan for tables.
        output_excel: Path of the Excel workbook to create.
    """
    # NOTE(review): no ``pages`` argument is passed, so camelot's default
    # page selection applies; confirm it covers the pages you need.
    detected = camelot.read_pdf(pdf_path, flavor="lattice")
    with pd.ExcelWriter(output_excel) as workbook:
        for number, table in enumerate(detected, start=1):
            sheet = f"Table_{number}"
            table.df.to_excel(workbook, sheet_name=sheet)

extract_tables_to_excel("financial_report.pdf", "tables.xlsx")

Optimization: Use stream flavor for borderless tables:

tables = camelot.read_pdf(pdf_path, flavor="stream", row_tol=15)

3.2 Auto-Fill PDF Forms

Keyword: “Python fill PDF form fields”

import pdfrw  

def fill_pdf_form(template_path, data_dict, output_path):
    """Fill the form fields of a PDF template and write the result.

    Every annotation on every page whose field name (``/T``) is a key of
    *data_dict* gets its value (``/V``) set to the mapped entry.

    Args:
        template_path: Path of the PDF form to read.
        data_dict: Mapping of field name to the value to store.
        output_path: Path of the filled PDF to write.
    """
    template = pdfrw.PdfReader(template_path)
    for page in template.pages:
        # A page without annotations has no /Annots entry; fall back to an
        # empty sequence instead of iterating over None.
        for annotation in page.Annots or ():
            if annotation.T:
                key = annotation.T[1:-1]  # Remove parentheses
                if key in data_dict:
                    annotation.update(pdfrw.PdfDict(V=data_dict[key]))
    pdfrw.PdfWriter().write(output_path, template)

# Usage  
data = {"name": "John Doe", "email": "john@example.com"}  
fill_pdf_form("application.pdf", data, "filled_form.pdf")

3.3 Encrypt 1000s of PDFs

Keyword: “Batch encrypt PDFs Python”

import os  
from PyPDF4 import PdfFileReader, PdfFileWriter  

def batch_encrypt(folder, password, output_dir="."):
    """Write password-protected ``encrypted_<name>`` copies of PDFs.

    Args:
        folder: Directory whose ``.pdf`` files are encrypted. A path to a
            single PDF file is accepted as well.
        password: Password applied to every encrypted copy.
        output_dir: Directory that receives the copies. Defaults to the
            current working directory.
    """
    if os.path.isfile(folder):
        # Single-file call such as batch_encrypt("merged_invoices.pdf", pw).
        source_dir, single_name = os.path.split(folder)
        names = [single_name]
    else:
        source_dir, names = folder, os.listdir(folder)

    for file in names:
        if not file.endswith(".pdf"):
            continue
        reader = PdfFileReader(os.path.join(source_dir, file))
        writer = PdfFileWriter()
        for page_num in range(reader.numPages):
            writer.addPage(reader.getPage(page_num))
        writer.encrypt(password, use_128bit=True)
        with open(os.path.join(output_dir, f"encrypted_{file}"), "wb") as f:
            writer.write(f)

batch_encrypt("./sensitive_docs", "SecurePass123!")

4. Real-World Workflows

4.1 Automated Invoice Processing

Problem: 500+ monthly invoices needing merge → encrypt → email → archive.
Solution:

import smtplib  
from email.mime.multipart import MIMEMultipart  
from email.mime.base import MIMEBase  

def process_invoices(smtp_user=None, smtp_password=None):
    """Merge, encrypt, email and archive the invoices in ``./invoices``.

    Steps:
        1. Merge all invoice PDFs into one file.
        2. Encrypt the merged file; the result is
           ``encrypted_merged_invoices.pdf`` in the current directory.
        3. Email the encrypted file as the attachment ``invoices.pdf``.
        4. Upload the encrypted file to S3.

    Args:
        smtp_user: Optional SMTP login name. When given, the connection is
            authenticated before sending.
        smtp_password: Password used together with *smtp_user*.
    """
    import os
    import tempfile
    from email.mime.application import MIMEApplication

    sender = "you@company.com"
    recipient = "client@company.com"
    encrypted_path = "encrypted_merged_invoices.pdf"

    # 1. Merge + 2. Encrypt
    # batch_encrypt works on a folder, so the merged file is placed in a
    # scratch directory of its own; this also avoids leaving an unencrypted
    # merged copy behind.
    with tempfile.TemporaryDirectory() as workdir:
        batch_merge("./invoices", os.path.join(workdir, "merged_invoices.pdf"))
        batch_encrypt(workdir, "ClientPass2024")

    # 3. Email
    msg = MIMEMultipart()
    msg["From"] = sender
    msg["To"] = recipient
    msg["Subject"] = "Invoices"
    with open(encrypted_path, "rb") as f:
        # MIMEApplication stores the bytes and base64-encodes them.
        attachment = MIMEApplication(f.read(), _subtype="pdf", Name="invoices.pdf")
    attachment.add_header(
        "Content-Disposition", "attachment", filename="invoices.pdf"
    )
    msg.attach(attachment)
    # The context manager closes the SMTP session even if sending fails.
    with smtplib.SMTP("smtp.gmail.com", 587) as server:
        server.starttls()
        if smtp_user is not None:
            server.login(smtp_user, smtp_password)
        server.sendmail(sender, recipient, msg.as_string())

    # 4. Archive to S3
    import boto3
    s3 = boto3.client("s3")
    s3.upload_file(encrypted_path, "your-bucket", "invoices_2024.pdf")

4.2 Auto-Rename Scanned PDFs

OCR + Rename Workflow (the function returns the new file name; apply it yourself, e.g. with os.rename):

import pytesseract  
from pdf2image import convert_from_path  
import re  

def rename_by_content(pdf_path):
    """Build a file name from the invoice number and date found by OCR.

    The PDF is rendered to images, OCR'd page by page, and the combined text
    is searched for ``Invoice No: <digits>`` and ``Date: NN-NN-NNNN``. The
    file itself is not renamed; only the proposed name is returned.

    Args:
        pdf_path: Path of the scanned PDF.

    Returns:
        A name of the form ``Invoice_<number>_<date>.pdf``.

    Raises:
        ValueError: If the invoice number or the date is not found in the
            OCR text.
    """
    # 300 is the second positional argument of convert_from_path
    # (presumably the render DPI; confirm against pdf2image).
    pages = convert_from_path(pdf_path, 300)
    text = "".join(pytesseract.image_to_string(page) for page in pages)

    # Extract invoice number and date
    invoice_match = re.search(r"Invoice No: (\d+)", text)
    date_match = re.search(r"Date: (\d{2}-\d{2}-\d{4})", text)
    if invoice_match is None or date_match is None:
        # OCR output is noisy; fail with a clear message instead of an
        # AttributeError on None.
        raise ValueError(
            f"Could not find invoice number and date in {pdf_path!r}"
        )
    return f"Invoice_{invoice_match.group(1)}_{date_match.group(1)}.pdf"

new_name = rename_by_content("scan.pdf")  # e.g. Invoice_1234_15-03-2024.pdf (date keeps the NN-NN-NNNN form the regex matches)

5. Error Handling & Debugging

5.1 Fix Corrupted PDFs

from pikepdf import Pdf  

def fix_corrupted(input_path, output_path):
    """Re-save *input_path* through pikepdf to *output_path*.

    Args:
        input_path: Path of the possibly damaged PDF.
        output_path: Path of the rewritten PDF.

    Returns:
        True when the file was opened and saved, False when pikepdf reported
        a PDF error.
    """
    from pikepdf import PdfError

    try:
        with Pdf.open(input_path) as pdf:
            pdf.save(output_path)
    except PdfError as e:
        # Do not fall back to overwriting the input in place: re-opening the
        # same file cannot succeed where this attempt failed, and it would
        # risk the only copy without ever producing output_path.
        print(f"Error: {e}. Could not repair {input_path}.")
        return False
    return True

5.2 Logging for Batch Jobs

import logging  

# Send ERROR-and-above records to a dedicated log file.
logging.basicConfig(level=logging.ERROR, filename="pdf_automation.log")

try:
    batch_merge("./docs", "merged.pdf")
except Exception as exc:
    # logging.exception logs at ERROR level and appends the traceback.
    logging.exception(f"Merge failed: {exc}")

6. Performance Optimization

6.1 Parallel Processing

from concurrent.futures import ThreadPoolExecutor  

def process_pdf(file):
    """Process one PDF; placeholder for the per-file operation.

    Args:
        file: Name of the PDF file to process.
    """
    # Your PDF operation here
    pass


# The listing does not need the pool, so build it before starting workers.
files = [f for f in os.listdir(".") if f.endswith(".pdf")]
with ThreadPoolExecutor() as executor:
    # Drain the iterator: executor.map re-raises a worker's exception only
    # when its result is retrieved, so an unconsumed map hides failures.
    results = list(executor.map(process_pdf, files))

6.2 Memory Management

def process_large_pdf(input_path):
    """Pass the pages of *input_path* to ``process_page`` one at a time.

    The file stays open for the whole loop, and pages are fetched from the
    reader individually instead of being collected first.

    Args:
        input_path: Path of the PDF to process.
    """
    with open(input_path, "rb") as stream:
        reader = PdfFileReader(stream)
        total_pages = reader.numPages
        for index in range(total_pages):
            # Process one page at a time
            current = reader.getPage(index)
            process_page(current)