Merge, split, convert, and secure PDFs using lightweight Python libraries – no Adobe required.
Python’s simplicity + powerful libraries like PyPDF4, pdfrw, and pikepdf let you:
✅ Process 1000+ PDFs/hour on basic hardware
✅ Avoid $300+/year Adobe Acrobat costs
✅ Integrate with existing workflows (email, cloud, databases)
Real Impact: A SaaS startup automated client report generation, saving $8k/month in manual labor.
No Virtualenv Needed:
pip install pypdf4 pdfrw pikepdf
Test Installation:
import PyPDF4

# PyPDF4 releases are numbered 1.x, so do not expect 2.0 or higher here;
# any version string being printed means the install works.
print(PyPDF4.__version__)
Keyword: “Batch merge PDFs Python”
import os

from PyPDF4 import PdfFileMerger


def batch_merge(folder_path, output_name):
    """Merge every PDF found directly in ``folder_path`` into one file.

    Files are appended in lexicographic filename order.

    Args:
        folder_path: Directory to scan (non-recursive) for PDF files.
        output_name: Path of the merged PDF to write.
    """
    merger = PdfFileMerger()
    try:
        for file in sorted(os.listdir(folder_path)):
            # Compare the extension case-insensitively so "SCAN.PDF" is
            # not silently skipped.
            if file.lower().endswith(".pdf"):
                merger.append(os.path.join(folder_path, file))
        merger.write(output_name)
    finally:
        # Close the merger even when append/write raised.
        merger.close()


# Usage
batch_merge("./invoices", "merged_invoices_Q2.pdf")
Pro Tip: Add numeric sorting for sequenced files:
# Sort on the number that follows the first underscore, so that
# "invoice_2.pdf" comes before "invoice_10.pdf". The extension is removed
# first: int("2.pdf") would raise ValueError.
sorted_files = sorted(
    os.listdir(folder_path),
    key=lambda x: int(os.path.splitext(x)[0].split("_")[1]),
)
Keyword: “Auto-rotate scanned PDFs”
from pikepdf import Pdf, Page


def autorotate_pdfs(input_path, output_path):
    """Reset explicit page rotation to 0 and save a copy of the PDF.

    Every page that carries its own ``/Rotate`` entry gets it set to 0,
    whether the stored angle was valid or not. Pages without the entry
    are left untouched. No orientation detection is performed.

    Args:
        input_path: PDF file to read.
        output_path: Where the adjusted PDF is written.
    """
    # The context manager closes the input file once the copy is saved.
    with Pdf.open(input_path) as pdf:
        for page in pdf.pages:
            if '/Rotate' in page:
                page.Rotate = 0
        pdf.save(output_path)


autorotate_pdfs("scanned_docs.pdf", "fixed_scans.pdf")
Use Case: Correct 10k+ scans from multi-function printers.
Keyword: “Free PDF watermark tool Python”
from PyPDF4 import PdfFileWriter, PdfFileReader


def add_watermark(input_pdf, watermark_pdf, output_pdf):
    """Merge the first page of ``watermark_pdf`` onto every page of a PDF.

    Args:
        input_pdf: Path of the PDF to watermark.
        watermark_pdf: Path of a PDF whose first page is the watermark.
        output_pdf: Path the watermarked PDF is written to.
    """
    watermark = PdfFileReader(watermark_pdf).getPage(0)
    writer = PdfFileWriter()
    with open(input_pdf, "rb") as source:
        pdf = PdfFileReader(source)
        for i in range(pdf.getNumPages()):
            page = pdf.getPage(i)
            page.mergePage(watermark)
            writer.addPage(page)
        # Write while the source file is still open: the writer may still
        # need to read page data from it.
        with open(output_pdf, "wb") as target:
            writer.write(target)


add_watermark("draft.pdf", "confidential_watermark.pdf", "final.pdf")
Advanced: Generate watermarks dynamically with ReportLab:
from reportlab.pdfgen import canvas


def create_text_watermark(text, output):
    """Draw ``text`` as a grey label rotated by 45 degrees into a PDF.

    Args:
        text: The watermark text.
        output: Path of the PDF file to create.
    """
    watermark_canvas = canvas.Canvas(output)
    # Styling first: 50% grey, 40 pt Helvetica.
    watermark_canvas.setFillGray(0.5)
    watermark_canvas.setFont("Helvetica", 40)
    # Rotate the coordinate system before drawing so the text is slanted.
    watermark_canvas.rotate(45)
    watermark_canvas.drawString(100, 100, text)
    watermark_canvas.save()


create_text_watermark("DRAFT", "draft_watermark.pdf")
Keyword: “PDF table extraction Python”
import camelot
import pandas as pd


def extract_tables_to_excel(pdf_path, output_excel):
    """Write each table camelot finds in a PDF to its own Excel sheet.

    Tables are read with the "lattice" flavor and stored in sheets named
    ``Table_1``, ``Table_2``, ... in the order camelot returns them.

    Args:
        pdf_path: PDF file to scan for tables.
        output_excel: Path of the workbook to create.
    """
    detected_tables = camelot.read_pdf(pdf_path, flavor="lattice")
    with pd.ExcelWriter(output_excel) as writer:
        for sheet_no, table in enumerate(detected_tables, start=1):
            table.df.to_excel(writer, sheet_name=f"Table_{sheet_no}")


extract_tables_to_excel("financial_report.pdf", "tables.xlsx")
Optimization: Use the stream flavor for borderless tables:
# Read the tables with flavor="stream" instead of "lattice".
# NOTE(review): row_tol=15 is presumably a tolerance for grouping text into
# rows — confirm the meaning and default in the camelot documentation.
tables = camelot.read_pdf(pdf_path, flavor="stream", row_tol=15)
Keyword: “Python fill PDF form fields”
import pdfrw


def fill_pdf_form(template_path, data_dict, output_path):
    """Fill form fields of a PDF template and write the result.

    Each annotation whose field name (``/T``) is a key of ``data_dict``
    gets that entry as its value (``/V``). All pages are processed.

    Args:
        template_path: Path of the fillable PDF.
        data_dict: Mapping of field name to the value to store.
        output_path: Path the filled PDF is written to.
    """
    template = pdfrw.PdfReader(template_path)
    for page in template.pages:
        # A page without annotations has no /Annots entry; treat it as empty
        # instead of failing on iteration over None.
        for annotation in page.Annots or []:
            if annotation.T:
                key = annotation.T[1:-1]  # Remove parentheses
                if key in data_dict:
                    annotation.update(pdfrw.PdfDict(V=data_dict[key]))

    # Ask viewers to regenerate the field appearances; otherwise many of
    # them keep showing the old (empty) appearance of a filled field.
    acro_form = template.Root.AcroForm
    if acro_form is not None:
        acro_form.update(
            pdfrw.PdfDict(NeedAppearances=pdfrw.PdfObject("true"))
        )

    pdfrw.PdfWriter().write(output_path, template)


# Usage
data = {"name": "John Doe", "email": "john@example.com"}
fill_pdf_form("application.pdf", data, "filled_form.pdf")
Keyword: “Batch encrypt PDFs Python”
import os

from PyPDF4 import PdfFileReader, PdfFileWriter


def batch_encrypt(folder, password):
    """Write a password-protected copy of every ``.pdf`` file in ``folder``.

    Each copy is named ``encrypted_<original name>`` and is created in the
    current working directory, not in ``folder``.

    Args:
        folder: Directory whose PDF files are encrypted.
        password: Password applied to every copy (128-bit encryption).
    """
    for file in os.listdir(folder):
        if not file.endswith(".pdf"):
            continue

        reader = PdfFileReader(os.path.join(folder, file))
        writer = PdfFileWriter()

        # Copy the pages one by one into the new document.
        page_count = reader.numPages
        for index in range(page_count):
            writer.addPage(reader.getPage(index))

        writer.encrypt(password, use_128bit=True)

        target_name = f"encrypted_{file}"
        with open(target_name, "wb") as f:
            writer.write(f)


batch_encrypt("./sensitive_docs", "SecurePass123!")
Problem: 500+ monthly invoices needing merge → email → archive.
Solution:
import os
import shutil
import smtplib
import tempfile
from email import encoders
from email.mime.base import MIMEBase
from email.mime.multipart import MIMEMultipart


def process_invoices():
    """Merge, encrypt, email, and archive the invoices in ``./invoices``.

    Steps:
        1. Merge all invoices into ``merged_invoices.pdf``.
        2. Encrypt the merged file (``encrypted_merged_invoices.pdf``).
        3. Email the encrypted file as an attachment.
        4. Upload the encrypted file to S3.

    Relies on ``batch_merge`` and ``batch_encrypt`` being defined, and on
    ``batch_encrypt`` writing ``encrypted_<name>`` into the current working
    directory. If the environment variables ``SMTP_USER`` and
    ``SMTP_PASSWORD`` are both set, they are used to log in to the SMTP
    server.
    """
    # 1. Merge
    batch_merge("./invoices", "merged_invoices.pdf")

    # 2. Encrypt
    # batch_encrypt() takes a folder, not a file: stage the merged PDF in a
    # scratch folder of its own so that exactly this one file is encrypted.
    with tempfile.TemporaryDirectory() as staging_dir:
        shutil.copy("merged_invoices.pdf", staging_dir)
        batch_encrypt(staging_dir, "ClientPass2024")

    # 3. Email
    msg = MIMEMultipart()
    msg["From"] = "you@company.com"
    msg["To"] = "client@company.com"
    msg["Subject"] = "Invoices"

    attachment = MIMEBase("application", "pdf", Name="invoices.pdf")
    with open("encrypted_merged_invoices.pdf", "rb") as f:
        # Without a payload the attachment would arrive empty.
        attachment.set_payload(f.read())
    encoders.encode_base64(attachment)
    attachment.add_header(
        "Content-Disposition", "attachment", filename="invoices.pdf"
    )
    msg.attach(attachment)

    # The context manager sends QUIT and closes the connection.
    with smtplib.SMTP("smtp.gmail.com", 587) as server:
        # Port 587 is the submission port; upgrade to TLS before sending.
        server.starttls()
        smtp_user = os.environ.get("SMTP_USER")
        smtp_password = os.environ.get("SMTP_PASSWORD")
        if smtp_user and smtp_password:
            server.login(smtp_user, smtp_password)
        server.sendmail("you@company.com", "client@company.com", msg.as_string())

    # 4. Archive to S3
    import boto3

    s3 = boto3.client("s3")
    s3.upload_file("encrypted_merged_invoices.pdf", "your-bucket", "invoices_2024.pdf")
OCR + Rename Workflow:
import re

import pytesseract
from pdf2image import convert_from_path


def rename_by_content(pdf_path):
    """Build an invoice filename from the OCR text of a scanned PDF.

    The PDF is rendered to images (300 is passed as the second argument of
    ``convert_from_path``), OCR'd page by page, and searched for
    ``Invoice No: <digits>`` and ``Date: DD-DD-DDDD``.

    Args:
        pdf_path: Path of the scanned PDF.

    Returns:
        ``"Invoice_<number>_<date>.pdf"``; the file itself is not renamed.

    Raises:
        ValueError: If the invoice number or the date is not found in the
            recognized text.
    """
    pages = convert_from_path(pdf_path, 300)
    text = "".join(pytesseract.image_to_string(page) for page in pages)

    # Extract invoice number and date
    invoice_match = re.search(r"Invoice No: (\d+)", text)
    date_match = re.search(r"Date: (\d{2}-\d{2}-\d{4})", text)
    if invoice_match is None or date_match is None:
        # OCR output is unreliable; fail with a clear message instead of an
        # AttributeError on None.
        raise ValueError(
            f"Could not find invoice number and date in OCR text of {pdf_path!r}"
        )

    return f"Invoice_{invoice_match.group(1)}_{date_match.group(1)}.pdf"


new_name = rename_by_content("scan.pdf")  # → e.g. Invoice_1234_15-03-2024.pdf
from pikepdf import Pdf


def fix_corrupted(input_path, output_path):
    """Re-save a PDF through pikepdf, retrying once if the first try fails.

    Args:
        input_path: Path of the PDF to repair.
        output_path: Path the re-saved PDF is written to.

    Raises:
        Exception: Whatever pikepdf raises if the retry fails as well.
    """
    try:
        with Pdf.open(input_path) as pdf:
            pdf.save(output_path)
    except Exception as e:
        print(f"Error: {e}. Trying incremental save...")
        # Retry with the input loaded into memory, which also permits
        # output_path == input_path. Save to output_path explicitly: a bare
        # save() would overwrite the input file and ignore output_path.
        with Pdf.open(input_path, allow_overwriting_input=True) as pdf:
            pdf.save(output_path)
import logging

# Errors (and above) are appended to a log file next to the script.
logging.basicConfig(level=logging.ERROR, filename="pdf_automation.log")

try:
    batch_merge("./docs", "merged.pdf")
except Exception as e:
    # logging.exception() logs at ERROR level and includes the traceback.
    logging.exception(f"Merge failed: {e}")
import os
from concurrent.futures import ThreadPoolExecutor


def process_pdf(file):
    """Process a single PDF file (placeholder: does nothing yet).

    Args:
        file: Name of the PDF file to process.
    """
    # Your PDF operation here
    pass


with ThreadPoolExecutor() as executor:
    files = [f for f in os.listdir(".") if f.endswith(".pdf")]
    # An exception raised in a worker only surfaces when its result is
    # retrieved from the iterator map() returns. Consume the iterator so
    # failures are not silently dropped.
    results = list(executor.map(process_pdf, files))
def process_large_pdf(input_path):
    """Feed the pages of a PDF to ``process_page`` one at a time.

    Args:
        input_path: Path of the PDF to process.
    """
    with open(input_path, "rb") as f:
        reader = PdfFileReader(f)
        total_pages = reader.numPages
        for index in range(total_pages):
            # Process one page at a time
            process_page(reader.getPage(index))
Download 25 Ready-to-Use Scripts
Merge/Split
OCR & Renaming
Form Filling
Error Handling
Q: Can I run these without coding skills?
A: Yes! Copy-paste the scripts into .py files and run them via Terminal.
Q: How to automate daily?
A: Use cron (Linux/Mac) or Task Scheduler (Windows).
Q: Are there cloud alternatives?
A: Yes, but self-hosted Python is cheaper and more secure.
With these 25 free scripts, you can eliminate repetitive PDF tasks forever. Start with basic merges, then explore OCR and cloud automation.
Next: Secure your automated PDFs
Introduction: Why Kofax ReadSoft Dominates Enterprise Document Processing In today's data-driven business landscape, 90% of organizations…
Working with PDF files on Linux has often posed a unique challenge for professionals. Whether…
Introduction to PDF Utility in System Administration PDFs are an essential part of the workflow…
Removing a PDF password might sound like a minor task, but when time is short…
Introduction: Why You Need a Free PDF Editor Free PDF Editors, PDFs dominate our digital…
Introduction: In 2025, cyber threats are evolving faster than ever—ransomware, AI-powered phishing, and quantum computing…