Merge, split, convert, and secure PDFs using lightweight Python libraries – no Adobe required.
Python’s simplicity, combined with powerful libraries like PyPDF4, pdfrw, and pikepdf, lets you:
✅ Process 1000+ PDFs/hour on basic hardware
✅ Avoid $300+/year Adobe Acrobat costs
✅ Integrate with existing workflows (email, cloud, databases)
Real Impact: A SaaS startup automated client report generation, saving $8k/month in manual labor.
No Virtualenv Needed:
pip install pypdf4 pdfrw pikepdf
Test Installation:
import PyPDF4

# Prints the installed PyPDF4 version string; an ImportError here means the
# install step did not succeed.
# NOTE(review): PyPDF4 releases on PyPI appear to be 1.x (e.g. 1.27.0), so the
# earlier "should return ≥2.0" expectation looks wrong — confirm before relying on it.
print(PyPDF4.__version__)
Keyword: “Batch merge PDFs Python”
import os

from PyPDF4 import PdfFileMerger


def batch_merge(folder_path, output_name):
    """Merge every PDF in ``folder_path`` into one file, in sorted filename order.

    Args:
        folder_path: Directory whose PDF files are merged.
        output_name: Path of the merged PDF to write.
    """
    output_abs = os.path.abspath(output_name)
    merger = PdfFileMerger()
    try:
        for name in sorted(os.listdir(folder_path)):
            # Accept ".PDF" too; the extension check was case-sensitive before.
            if not name.lower().endswith(".pdf"):
                continue
            source = os.path.join(folder_path, name)
            # A previous run's output may live in the same folder; merging it
            # into itself would duplicate every page.
            if os.path.abspath(source) == output_abs:
                continue
            merger.append(source)
        merger.write(output_name)
    finally:
        # Always release the merger, even when one input fails to parse.
        merger.close()


# Usage
batch_merge("./invoices", "merged_invoices_Q2.pdf")
Pro Tip: Add numeric sorting for sequenced files:
import os
import re


def _sequence_number(filename):
    """Return the first digit run that follows an underscore (``invoice_12.pdf`` -> 12)."""
    match = re.search(r"_(\d+)", filename)
    # The old key, int(x.split("_")[1]), raised ValueError on "invoice_12.pdf"
    # because the field still contained ".pdf". Names without a number sort last.
    return int(match.group(1)) if match else float("inf")


sorted_files = sorted(os.listdir(folder_path), key=_sequence_number)
Keyword: “Auto-rotate scanned PDFs”
from pikepdf import Pdf, Page


def autorotate_pdfs(input_path, output_path):
    """Reset the ``/Rotate`` entry to 0 on every page that has one and save a copy.

    This clears stored rotation flags only; it does not inspect page content
    to detect orientation. Pages without a ``/Rotate`` entry are left untouched.

    Args:
        input_path: PDF to read.
        output_path: Where the corrected PDF is written.
    """
    # The context manager closes the PDF even if saving fails.
    with Pdf.open(input_path) as pdf:
        for page in pdf.pages:
            # Both the "invalid angle" and the "valid angle" branch assigned 0,
            # so a single assignment covers them.
            if '/Rotate' in page:
                page.Rotate = 0
        pdf.save(output_path)


autorotate_pdfs("scanned_docs.pdf", "fixed_scans.pdf")
Use Case: Correct 10k+ scans from multi-function printers.
Keyword: “Free PDF watermark tool Python”
from PyPDF4 import PdfFileWriter, PdfFileReader


def add_watermark(input_pdf, watermark_pdf, output_pdf):
    """Merge the first page of ``watermark_pdf`` onto every page of ``input_pdf``.

    Args:
        input_pdf: Path of the PDF to stamp.
        watermark_pdf: Path of a PDF whose first page is the watermark.
        output_pdf: Path of the stamped PDF to write. Must differ from
            ``input_pdf``, because the input is still being read while the
            output is written.
    """
    writer = PdfFileWriter()
    # Keep both inputs open until the output is written: the pages added to
    # the writer are still backed by these open files at write time.
    with open(input_pdf, "rb") as source, open(watermark_pdf, "rb") as stamp:
        watermark = PdfFileReader(stamp).getPage(0)
        pdf = PdfFileReader(source)
        for i in range(pdf.getNumPages()):
            page = pdf.getPage(i)
            page.mergePage(watermark)
            writer.addPage(page)
        with open(output_pdf, "wb") as target:
            writer.write(target)


add_watermark("draft.pdf", "confidential_watermark.pdf", "final.pdf")
Advanced: Generate watermarks dynamically with ReportLab:
from reportlab.pdfgen import canvas


def create_text_watermark(text, output):
    """Write a PDF at ``output`` containing ``text`` as a rotated gray label.

    The text is set in 40-point Helvetica with fill gray level 0.5, after the
    canvas coordinate system has been rotated by 45 degrees.

    Args:
        text: The watermark text to draw.
        output: Path of the PDF file to create.
    """
    sheet = canvas.Canvas(output)

    # Text appearance.
    sheet.setFont("Helvetica", 40)
    sheet.setFillGray(0.5)

    # Rotate first so the string is drawn along the rotated axis.
    sheet.rotate(45)
    sheet.drawString(100, 100, text)

    sheet.save()


create_text_watermark("DRAFT", "draft_watermark.pdf")
Keyword: “PDF table extraction Python”
import camelot
import pandas as pd


def extract_tables_to_excel(pdf_path, output_excel, pages="1"):
    """Extract ruled ("lattice") tables from a PDF into one Excel sheet per table.

    Args:
        pdf_path: PDF to scan for tables.
        output_excel: Path of the workbook to write.
        pages: Page selection passed to Camelot. The default "1" matches the
            previous behavior, which did not pass ``pages`` at all.

    Raises:
        ValueError: If no tables were detected, so no workbook can be written.
    """
    tables = camelot.read_pdf(pdf_path, pages=pages, flavor="lattice")
    # Fail early with a clear message rather than letting the Excel writer
    # fail on a workbook that has no sheets.
    if len(tables) == 0:
        raise ValueError(f"No tables found in {pdf_path!r} (pages={pages!r})")

    with pd.ExcelWriter(output_excel) as writer:
        for number, table in enumerate(tables, start=1):
            table.df.to_excel(writer, sheet_name=f"Table_{number}")


extract_tables_to_excel("financial_report.pdf", "tables.xlsx")
Optimization: Use the `stream` flavor for borderless tables:
# Re-read the same PDF with the "stream" flavor instead of "lattice".
# NOTE(review): row_tol=15 presumably loosens the vertical tolerance used to
# group text into rows — confirm the value against the Camelot documentation.
tables = camelot.read_pdf(pdf_path, flavor="stream", row_tol=15)
Keyword: “Python fill PDF form fields”
import pdfrw


def fill_pdf_form(template_path, data_dict, output_path):
    """Fill form fields of a PDF template and write the result.

    Args:
        template_path: PDF containing form fields.
        data_dict: Mapping of field name -> value to store in the field.
        output_path: Path of the filled PDF to write.
    """
    template = pdfrw.PdfReader(template_path)

    # Visit every page, not only the first.
    for page in template.pages:
        # A page without annotations has no /Annots entry, so fall back to an
        # empty list instead of iterating over None.
        for annotation in page.Annots or []:
            if not annotation.T:
                continue
            key = annotation.T[1:-1]  # Remove parentheses
            if key in data_dict:
                annotation.update(pdfrw.PdfDict(V=data_dict[key]))

    # Ask viewers to regenerate field appearances; otherwise many of them keep
    # showing the old (empty) appearance stream despite the new /V value.
    acro_form = template.Root.AcroForm
    if acro_form is not None:
        acro_form.update(
            pdfrw.PdfDict(NeedAppearances=pdfrw.PdfObject("true"))
        )

    pdfrw.PdfWriter().write(output_path, template)


# Usage
data = {"name": "John Doe", "email": "john@example.com"}
fill_pdf_form("application.pdf", data, "filled_form.pdf")
Keyword: “Batch encrypt PDFs Python”
import os

from PyPDF4 import PdfFileReader, PdfFileWriter


def batch_encrypt(folder, password, output_dir="."):
    """Write a password-protected copy of every PDF in ``folder``.

    Each copy is named ``encrypted_<original name>``.

    Args:
        folder: Directory containing the PDFs to encrypt.
        password: Password applied to every output file.
        output_dir: Directory that receives the encrypted copies. Defaults to
            the current working directory, as before.
    """
    # When outputs land in the folder being scanned, skip earlier outputs so a
    # second run does not try to re-encrypt already encrypted files.
    same_dir = os.path.abspath(folder) == os.path.abspath(output_dir)

    for name in sorted(os.listdir(folder)):
        # Accept ".PDF" too; the extension check was case-sensitive before.
        if not name.lower().endswith(".pdf"):
            continue
        if same_dir and name.startswith("encrypted_"):
            continue

        with open(os.path.join(folder, name), "rb") as source:
            reader = PdfFileReader(source)
            writer = PdfFileWriter()
            for page_num in range(reader.numPages):
                writer.addPage(reader.getPage(page_num))
            writer.encrypt(password, use_128bit=True)
            # Write while the source is still open.
            target_path = os.path.join(output_dir, f"encrypted_{name}")
            with open(target_path, "wb") as target:
                writer.write(target)


# NOTE: example password only — load real passwords from a secret store.
batch_encrypt("./sensitive_docs", "SecurePass123!")
Problem: 500+ monthly invoices needing merge → email → archive.
Solution:
import os
import smtplib
import tempfile
from email import encoders
from email.mime.multipart import MIMEMultipart
from email.mime.base import MIMEBase


def process_invoices():
    """Merge the invoices, encrypt the result, email it, and archive it to S3.

    Relies on ``batch_merge`` and ``batch_encrypt`` being defined in the same
    script. SMTP credentials are read from the ``SMTP_USER`` and
    ``SMTP_PASSWORD`` environment variables when both are set.
    """
    encrypted_pdf = "encrypted_merged_invoices.pdf"

    # 1. Merge into a scratch folder. batch_encrypt works on a *folder*, so the
    #    merged file gets a directory of its own instead of being passed as a
    #    file path (which made os.listdir fail).
    with tempfile.TemporaryDirectory() as staging:
        batch_merge("./invoices", os.path.join(staging, "merged_invoices.pdf"))

        # 2. Encrypt -> writes encrypted_merged_invoices.pdf to the working dir.
        # NOTE: example password only — do not hard-code real passwords.
        batch_encrypt(staging, "ClientPass2024")

    # 3. Email
    msg = MIMEMultipart()
    msg["From"] = "you@company.com"
    msg["To"] = "client@company.com"
    msg["Subject"] = "Invoices"

    # Build ONE attachment that carries the file's bytes; the earlier code
    # attached two empty parts and never read the file.
    attachment = MIMEBase("application", "pdf", Name="invoices.pdf")
    with open(encrypted_pdf, "rb") as f:
        attachment.set_payload(f.read())
    encoders.encode_base64(attachment)
    attachment.add_header(
        "Content-Disposition", "attachment", filename="invoices.pdf"
    )
    msg.attach(attachment)

    # The context manager closes the connection on exit.
    with smtplib.SMTP("smtp.gmail.com", 587) as server:
        server.starttls()  # port 587 expects a STARTTLS upgrade
        user = os.environ.get("SMTP_USER")
        password = os.environ.get("SMTP_PASSWORD")
        if user and password:
            server.login(user, password)
        server.sendmail("you@company.com", "client@company.com", msg.as_string())

    # 4. Archive to S3
    import boto3  # local import: only needed for the archive step

    s3 = boto3.client("s3")
    s3.upload_file(encrypted_pdf, "your-bucket", "invoices_2024.pdf")
OCR + Rename Workflow:
import pytesseract
from pdf2image import convert_from_path
import re


def rename_by_content(pdf_path):
    """Build an ``Invoice_<number>_<date>.pdf`` filename from OCR'd page text.

    The PDF is rendered at 300 DPI, every page is OCR'd, and the combined text
    is searched for ``Invoice No: <digits>`` and ``Date: DD-DD-DDDD``. The file
    itself is not renamed; only the proposed name is returned.

    Args:
        pdf_path: Path of the scanned PDF.

    Returns:
        The proposed filename.

    Raises:
        ValueError: If the invoice number or the date is not found in the text.
    """
    pages = convert_from_path(pdf_path, 300)
    text = "".join(pytesseract.image_to_string(page) for page in pages)

    # Extract invoice number and date.
    invoice_match = re.search(r"Invoice No: (\d+)", text)
    date_match = re.search(r"Date: (\d{2}-\d{2}-\d{4})", text)

    # OCR output is noisy; report which field is missing instead of failing
    # with AttributeError on a None match.
    missing = [
        label
        for label, match in (("invoice number", invoice_match), ("date", date_match))
        if match is None
    ]
    if missing:
        raise ValueError(
            f"Could not find {' and '.join(missing)} in OCR text of {pdf_path!r}"
        )

    return f"Invoice_{invoice_match.group(1)}_{date_match.group(1)}.pdf"


new_name = rename_by_content("scan.pdf")  # → e.g. Invoice_1234_15-03-2024.pdf
from pikepdf import Pdf


def fix_corrupted(input_path, output_path):
    """Rewrite a PDF through pikepdf, retrying once if the first attempt fails.

    Args:
        input_path: PDF to read.
        output_path: Where the rewritten PDF is saved (on both attempts).
    """
    try:
        with Pdf.open(input_path) as pdf:
            pdf.save(output_path)
    except Exception as e:
        print(f"Error: {e}. Trying incremental save...")
        # Retry with overwriting allowed, which covers the case where
        # output_path is the input file itself. Save to output_path explicitly:
        # a bare save() would write over the input and ignore output_path.
        with Pdf.open(input_path, allow_overwriting_input=True) as pdf:
            pdf.save(output_path)
import logging

# Send ERROR-and-above records to a log file next to the script.
logging.basicConfig(filename="pdf_automation.log", level=logging.ERROR)

try:
    batch_merge("./docs", "merged.pdf")
except Exception as e:
    # logging.exception logs at ERROR level and appends the traceback.
    logging.exception(f"Merge failed: {e}")
import os
from concurrent.futures import ThreadPoolExecutor


def process_pdf(file):
    """Handle a single PDF; replace the body with the real operation.

    Args:
        file: Name of a PDF file in the current directory.
    """
    # Your PDF operation here
    pass


files = [f for f in os.listdir(".") if f.endswith(".pdf")]
with ThreadPoolExecutor() as executor:
    # executor.map only re-raises a worker's exception when its result is
    # read, so consume the results; otherwise failures pass silently.
    results = list(executor.map(process_pdf, files))
def process_large_pdf(input_path):
    """Pass each page of the PDF at ``input_path`` to ``process_page``, in order.

    Args:
        input_path: Path of the PDF to walk through.
    """
    with open(input_path, "rb") as pdf_file:
        reader = PdfFileReader(pdf_file)
        page_count = reader.numPages
        for index in range(page_count):
            # Process one page at a time
            process_page(reader.getPage(index))
Download 25 Ready-to-Use Scripts
Merge/Split
OCR & Renaming
Form Filling
Error Handling
Q: Can I run these without coding skills?
A: Yes! Copy and paste the scripts into `.py` files and run them from the terminal.
Q: How to automate daily?
A: Use cron (Linux/Mac) or Task Scheduler (Windows).
Q: Are there cloud alternatives?
A: Yes, but self-hosted Python is cheaper and more secure.
With these 25 free scripts, you can eliminate repetitive PDF tasks forever. Start with basic merges, then explore OCR and cloud automation.
Next: Secure your automated PDFs
Introduction: How to Fill Documents on iPhone: No Computer Needed Your iPhone isn’t just a…
Introduction Mastering PDFBox Accessibility with Apache PDFBox In today’s digital landscape, PDFBOX accessibility isn’t optional—it’s a…
How to Convert PDF to Excel Using Python: Revolutionize Your Data Workflows Every day, businesses…
Table of Contents Introduction to A Long Walk to Water Detailed Summary of A Long…
Introduction: The Rise of Browser-Based PDF Editing In 2025, free online PDF editors have revolutionized document workflows.…
Introduction: Why Kofax ReadSoft Dominates Enterprise Document Processing In today's data-driven business landscape, 90% of organizations…