PDF Automation with Python: 25 Scripts
Merge, split, convert, and secure PDFs using lightweight Python libraries – no Adobe required.
Why Python for PDF Automation?
Python’s simplicity and powerful libraries like PyPDF4, pdfrw, and pikepdf let you:
- ✅ Process 1000+ PDFs/hour on basic hardware
- ✅ Avoid $300+/year Adobe Acrobat costs
- ✅ Integrate with existing workflows (email, cloud, databases)
Real Impact: A SaaS startup automated client report generation, saving $8k/month in manual labor.
1. Environment Setup (30 Seconds)
No Virtualenv Needed:
# Core libraries used by most scripts below
pip install pypdf4 pdfrw pikepdf
# Only needed for the watermark-generation, table-extraction, OCR, and S3 examples.
# pytesseract and pdf2image additionally require the Tesseract and Poppler binaries.
pip install reportlab camelot-py pandas openpyxl pytesseract pdf2image boto3
Test Installation:
import PyPDF4

# PyPDF4 releases are numbered 1.x (e.g. 1.27.0); any version string printed
# here confirms the package imported correctly.
print(PyPDF4.__version__)
2. Essential PDF Scripts
2.1 Batch Merge 1000s of PDFs
Keyword: “Batch merge PDFs Python”
import os

from PyPDF4 import PdfFileMerger


def batch_merge(folder_path, output_name):
    """Merge every PDF found directly in ``folder_path`` into ``output_name``.

    Files are appended in lexicographic filename order. The extension check
    is case-insensitive, so ``SCAN.PDF`` is picked up as well.

    Args:
        folder_path: Directory containing the PDFs to merge.
        output_name: Path of the merged PDF to create.
    """
    output_abs = os.path.abspath(output_name)
    merger = PdfFileMerger()
    try:
        for file_name in sorted(os.listdir(folder_path)):
            if not file_name.lower().endswith(".pdf"):
                continue
            source = os.path.join(folder_path, file_name)
            # If the output lives in the input folder, a re-run must not
            # merge the previous result into itself.
            if os.path.abspath(source) == output_abs:
                continue
            merger.append(source)
        merger.write(output_name)
    finally:
        # Release the merger's input file handles even when a file is unreadable.
        merger.close()


# Usage
batch_merge("./invoices", "merged_invoices_Q2.pdf")
Pro Tip: Add numeric sorting for sequenced files:
# Sort files named like "invoice_12.pdf" by their numeric part. The extension
# is stripped first; otherwise int("12.pdf") raises ValueError.
sorted_files = sorted(
    (name for name in os.listdir(folder_path) if name.lower().endswith(".pdf")),
    key=lambda name: int(os.path.splitext(name)[0].split("_")[1]),
)
2.2 Auto-Rotate Scanned PDFs
Keyword: “Auto-rotate scanned PDFs”
from pikepdf import Pdf, Page


def autorotate_pdfs(input_path, output_path):
    """Reset explicit page rotations in a PDF and save the result.

    Every page that carries its own ``/Rotate`` entry has it set to 0.
    Pages without the entry are left untouched. No content analysis is done:
    the orientation is not detected, only the rotation flag is cleared.

    Args:
        input_path: PDF to read.
        output_path: Where to write the corrected PDF.
    """
    # The context manager closes the input file once the copy is saved.
    with Pdf.open(input_path) as pdf:
        for page in pdf.pages:
            # Valid (0/90/180/270) and invalid angles are both reset to 0,
            # so there is no need to branch on the current value.
            if '/Rotate' in page:
                page.Rotate = 0
        pdf.save(output_path)


autorotate_pdfs("scanned_docs.pdf", "fixed_scans.pdf")
Use Case: Correct 10k+ scans from multi-function printers.
2.3 Add Dynamic Watermarks
Keyword: “Free PDF watermark tool Python”
from PyPDF4 import PdfFileWriter, PdfFileReader


def add_watermark(input_pdf, watermark_pdf, output_pdf):
    """Stamp the first page of ``watermark_pdf`` onto every page of ``input_pdf``.

    Args:
        input_pdf: PDF whose pages receive the watermark.
        watermark_pdf: PDF whose first page is used as the watermark.
        output_pdf: Path of the watermarked PDF to create.
    """
    writer = PdfFileWriter()
    # Both source files stay open until the output has been written: the
    # writer pulls page objects from the readers' streams when it serializes,
    # so closing a source early fails with "seek of closed file".
    with open(watermark_pdf, "rb") as wm_file, open(input_pdf, "rb") as in_file:
        watermark = PdfFileReader(wm_file).getPage(0)
        pdf = PdfFileReader(in_file)
        for i in range(pdf.getNumPages()):
            page = pdf.getPage(i)
            page.mergePage(watermark)
            writer.addPage(page)
        with open(output_pdf, "wb") as out_file:
            writer.write(out_file)


add_watermark("draft.pdf", "confidential_watermark.pdf", "final.pdf")
Advanced: Generate watermarks dynamically with ReportLab:
from reportlab.pdfgen import canvas


def create_text_watermark(text, output):
    """Create a PDF at ``output`` with ``text`` drawn as a watermark stamp.

    The text is drawn in 40 pt Helvetica, 50% grey, on a canvas rotated by
    45 degrees, starting at (100, 100) in the rotated coordinate system.

    Args:
        text: Watermark text to draw.
        output: Path of the PDF file to write.
    """
    sheet = canvas.Canvas(output)
    # Drawing state is configured first; the order of these three calls
    # does not matter as long as they precede drawString.
    sheet.rotate(45)
    sheet.setFillGray(0.5)
    sheet.setFont("Helvetica", 40)
    sheet.drawString(100, 100, text)
    sheet.save()


create_text_watermark("DRAFT", "draft_watermark.pdf")
3. Advanced Automation
3.1 Extract Tables to Excel
Keyword: “PDF table extraction Python”
import camelot
import pandas as pd


def extract_tables_to_excel(pdf_path, output_excel, pages="1"):
    """Extract ruled ("lattice") tables from a PDF into an Excel workbook.

    Each detected table is written to its own sheet, named ``Table_1``,
    ``Table_2``, and so on.

    Args:
        pdf_path: PDF to scan for tables.
        output_excel: Path of the ``.xlsx`` file to create.
        pages: Page selection passed to camelot, e.g. ``"1"``, ``"1,3-5"``
            or ``"all"``. Defaults to the first page, which is also
            camelot's own default.

    Raises:
        ValueError: If no tables are detected, since a workbook needs at
            least one sheet.
    """
    tables = camelot.read_pdf(pdf_path, pages=pages, flavor="lattice")
    # Fail early with a clear message instead of letting the Excel writer
    # crash on an empty workbook when it closes.
    if len(tables) == 0:
        raise ValueError(f"No tables found in {pdf_path!r} (pages={pages!r})")

    with pd.ExcelWriter(output_excel) as writer:
        for sheet_no, table in enumerate(tables, start=1):
            table.df.to_excel(writer, sheet_name=f"Table_{sheet_no}")


extract_tables_to_excel("financial_report.pdf", "tables.xlsx")
Optimization: Use the stream flavor for borderless tables:
# "stream" is the flavor for tables without ruling lines. row_tol is the
# tolerance camelot uses when grouping text vertically into rows.
tables = camelot.read_pdf(
    pdf_path,
    row_tol=15,
    flavor="stream",
)
3.2 Auto-Fill PDF Forms
Keyword: “Python fill PDF form fields”
import pdfrw


def fill_pdf_form(template_path, data_dict, output_path):
    """Fill the form fields of a PDF template and save the result.

    Fields on every page are matched by name (the annotation's ``/T``
    entry) against the keys of ``data_dict``; unmatched fields are left
    unchanged.

    Args:
        template_path: Fillable PDF to read.
        data_dict: Mapping of field name to the value to enter.
        output_path: Path of the filled PDF to create.
    """
    template = pdfrw.PdfReader(template_path)
    for page in template.pages:
        # A page without form fields has no /Annots entry (pdfrw yields None).
        for annotation in page.Annots or []:
            if not annotation.T:
                continue
            key = annotation.T[1:-1]  # Remove parentheses
            if key in data_dict:
                annotation.update(pdfrw.PdfDict(V=data_dict[key]))

    # Ask viewers to regenerate field appearances; without this flag many
    # readers show the fields as empty until they are clicked.
    acro_form = template.Root.AcroForm
    if acro_form is not None:
        acro_form.update(pdfrw.PdfDict(NeedAppearances=pdfrw.PdfObject("true")))

    pdfrw.PdfWriter().write(output_path, template)


# Usage
data = {"name": "John Doe", "email": "john@example.com"}
fill_pdf_form("application.pdf", data, "filled_form.pdf")
3.3 Encrypt 1000s of PDFs
Keyword: “Batch encrypt PDFs Python”
import os

from PyPDF4 import PdfFileReader, PdfFileWriter


def batch_encrypt(folder, password):
    """Write password-protected copies of PDFs into the current directory.

    Each source ``name.pdf`` produces ``encrypted_name.pdf`` in the current
    working directory. The originals are not modified.

    Args:
        folder: Directory whose PDFs should be encrypted. A path to a single
            PDF file is also accepted, in which case only that file is
            encrypted.
        password: User password required to open the encrypted copies.
    """
    if os.path.isfile(folder):
        sources = [folder]
    else:
        sources = [
            os.path.join(folder, name)
            for name in sorted(os.listdir(folder))
            if name.lower().endswith(".pdf")
        ]

    for source in sources:
        # Open each input explicitly so its handle is released before the
        # next file; a large batch would otherwise exhaust file descriptors.
        with open(source, "rb") as src:
            reader = PdfFileReader(src)
            writer = PdfFileWriter()
            for page_num in range(reader.numPages):
                writer.addPage(reader.getPage(page_num))
            writer.encrypt(password, use_128bit=True)
            # Write while the source is still open: the writer reads page
            # data from the source stream when it serializes.
            with open(f"encrypted_{os.path.basename(source)}", "wb") as f:
                writer.write(f)


batch_encrypt("./sensitive_docs", "SecurePass123!")
4. Real-World Workflows
4.1 Automated Invoice Processing
Problem: 500+ monthly invoices needing merge → email → archive.
Solution:
import smtplib
from email import encoders
from email.mime.multipart import MIMEMultipart
from email.mime.base import MIMEBase

from PyPDF4 import PdfFileReader, PdfFileWriter


def _encrypt_pdf(source, destination, password):
    """Write a password-protected copy of ``source`` to ``destination``."""
    with open(source, "rb") as src:
        reader = PdfFileReader(src)
        writer = PdfFileWriter()
        for page_num in range(reader.numPages):
            writer.addPage(reader.getPage(page_num))
        writer.encrypt(password, use_128bit=True)
        with open(destination, "wb") as dst:
            writer.write(dst)


def process_invoices(smtp_user=None, smtp_password=None):
    """Merge, encrypt, email, and archive the invoices in ``./invoices``.

    Requires ``batch_merge`` from section 2.1 to be defined in the same script.

    Args:
        smtp_user: SMTP login name. When omitted, no login is attempted.
        smtp_password: Password (or app password) for ``smtp_user``.
    """
    # 1. Merge
    batch_merge("./invoices", "merged_invoices.pdf")

    # 2. Encrypt (single file, so the folder-based batch helper does not apply)
    _encrypt_pdf(
        "merged_invoices.pdf", "encrypted_merged_invoices.pdf", "ClientPass2024"
    )

    # 3. Email
    msg = MIMEMultipart()
    msg["From"] = "you@company.com"
    msg["To"] = "client@company.com"
    attachment = MIMEBase("application", "pdf", Name="invoices.pdf")
    with open("encrypted_merged_invoices.pdf", "rb") as f:
        attachment.set_payload(f.read())
    # Binary payloads must be base64-encoded to survive mail transport.
    encoders.encode_base64(attachment)
    attachment.add_header(
        "Content-Disposition", "attachment", filename="invoices.pdf"
    )
    msg.attach(attachment)

    # The context manager closes the SMTP connection even if sending fails.
    with smtplib.SMTP("smtp.gmail.com", 587) as server:
        server.starttls()  # Port 587 expects the session to be upgraded to TLS
        if smtp_user is not None:
            server.login(smtp_user, smtp_password)
        server.sendmail("you@company.com", "client@company.com", msg.as_string())

    # 4. Archive to S3
    import boto3

    s3 = boto3.client("s3")
    s3.upload_file("encrypted_merged_invoices.pdf", "your-bucket", "invoices_2024.pdf")
4.2 Auto-Rename Scanned PDFs
OCR + Rename Workflow:
import pytesseract
from pdf2image import convert_from_path
import re


def rename_by_content(pdf_path):
    """Build a descriptive filename for a scanned invoice from its OCR text.

    The PDF is rendered at 300 DPI, OCR'd page by page, and searched for
    ``Invoice No: <digits>`` and ``Date: <dd-dd-dddd>``. The file itself is
    not renamed; the suggested name is returned.

    Args:
        pdf_path: Scanned PDF to analyse.

    Returns:
        A name of the form ``Invoice_<number>_<date>.pdf``.

    Raises:
        ValueError: If the invoice number or the date cannot be found in the
            OCR output.
    """
    pages = convert_from_path(pdf_path, 300)
    text = "".join(pytesseract.image_to_string(page) for page in pages)

    # Extract invoice number and date
    invoice_match = re.search(r"Invoice No: (\d+)", text)
    date_match = re.search(r"Date: (\d{2}-\d{2}-\d{4})", text)
    # OCR output is noisy; report a missing field explicitly instead of
    # failing with AttributeError on None.
    if invoice_match is None or date_match is None:
        raise ValueError(f"Invoice number or date not found in {pdf_path!r}")

    return f"Invoice_{invoice_match.group(1)}_{date_match.group(1)}.pdf"


new_name = rename_by_content("scan.pdf")  # → e.g. Invoice_1234_15-03-2024.pdf
5. Error Handling & Debugging
5.1 Fix Corrupted PDFs
from pikepdf import Pdf, PdfError


def fix_corrupted(input_path, output_path):
    """Try to repair a damaged PDF by re-saving it with pikepdf.

    Opening and re-saving a file lets the underlying library rebuild its
    structure. The input file is never modified; the repaired copy always
    goes to ``output_path``.

    Args:
        input_path: Possibly corrupted PDF.
        output_path: Where to write the repaired copy.

    Raises:
        pikepdf.PdfError: If the file cannot be processed on the second
            attempt either.
    """
    try:
        with Pdf.open(input_path) as pdf:
            pdf.save(output_path)
    except PdfError as e:
        print(f"Error: {e}. Retrying from an in-memory copy...")
        # allow_overwriting_input makes pikepdf load the whole file into
        # memory first. The result is still written to output_path so the
        # original is kept for further recovery attempts.
        with Pdf.open(input_path, allow_overwriting_input=True) as pdf:
            pdf.save(output_path)
5.2 Logging for Batch Jobs
import logging

# Only ERROR and above are written to the log file.
logging.basicConfig(level=logging.ERROR, filename="pdf_automation.log")

try:
    batch_merge("./docs", "merged.pdf")
except Exception as e:
    # logging.exception logs at ERROR level and appends the traceback.
    logging.exception(f"Merge failed: {e}")
6. Performance Optimization
6.1 Parallel Processing
import os
from concurrent.futures import ThreadPoolExecutor


def process_pdf(file):
    """Placeholder for the per-file operation; replace the body with real work.

    Args:
        file: Name of the PDF to process.
    """
    # Your PDF operation here
    pass


files = [f for f in os.listdir(".") if f.endswith(".pdf")]
with ThreadPoolExecutor() as executor:
    # executor.map is lazy about errors: an exception raised in a worker only
    # surfaces when its result is retrieved, so the results are consumed here.
    results = list(executor.map(process_pdf, files))
6.2 Memory Management
from PyPDF4 import PdfFileReader


def process_large_pdf(input_path):
    """Walk through a PDF page by page, handing each page to ``process_page``.

    ``process_page`` is not defined in this snippet; supply your own
    function with that name that accepts a single page object.

    Args:
        input_path: PDF to process.
    """
    with open(input_path, "rb") as f:
        reader = PdfFileReader(f)
        for page_num in range(reader.numPages):
            # Process one page at a time
            process_page(reader.getPage(page_num))
Free PDF Automation Toolkit
Download 25 Ready-to-Use Scripts
- Merge/Split
- OCR & Renaming
- Form Filling
- Error Handling
FAQ
Q: Can I run these without coding skills?
A: Yes! Copy-paste the scripts into .py files and run them via Terminal.
Q: How to automate daily?
A: Use cron (Linux/Mac) or Task Scheduler (Windows).
Q: Are there cloud alternatives?
A: Yes, but self-hosted Python is cheaper and more secure.
Conclusion
With these 25 free scripts, you can eliminate repetitive PDF tasks forever. Start with basic merges, then explore OCR and cloud automation.
Next: Secure your automated PDFs
Leave a Comment