name	PDF
description	PDF manipulation toolkit. Use when user wants to extract text, merge/split PDFs, create PDFs, extract images, or manipulate PDF documents.
source	base

PDF

Comprehensive PDF manipulation and extraction toolkit.

When to Use

Use this skill when the user wants to:

Extract text from PDFs
Merge or split PDF files
Create PDFs from scratch or HTML
Extract images from PDFs
Add or remove pages
Encrypt or decrypt PDFs
Extract metadata and tables
Convert PDFs to other formats

Installation

npm install pdf-lib pdf-parse

For advanced features:

npm install pdfkit    # Create PDFs
npm install puppeteer # HTML to PDF

Python alternative:

pip install PyPDF2 pdfplumber

Extract Text (JavaScript)

const fs = require('fs');
const pdf = require('pdf-parse');

// Read the whole file into memory and let pdf-parse pull out the page count
// and the plain text in one call.
const { numpages, text } = await pdf(fs.readFileSync('document.pdf'));

console.log('Pages:', numpages);
console.log('Text:', text);

Merge PDFs

const { PDFDocument } = require('pdf-lib');
const fs = require('fs');

/**
 * Concatenate several PDFs, in the order given, into a single output file.
 *
 * @param {string[]} pdfPaths - Paths of the PDFs to append, first to last.
 * @param {string} outputPath - Where the combined PDF is written.
 * @returns {Promise<void>}
 */
async function mergePDFs(pdfPaths, outputPath) {
  const combined = await PDFDocument.create();

  for (const sourcePath of pdfPaths) {
    const source = await PDFDocument.load(fs.readFileSync(sourcePath));
    // Pages must be copied into the destination document before they can be added.
    const copiedPages = await combined.copyPages(source, source.getPageIndices());
    for (const copiedPage of copiedPages) {
      combined.addPage(copiedPage);
    }
  }

  const combinedBytes = await combined.save();
  fs.writeFileSync(outputPath, combinedBytes);
}

await mergePDFs(['file1.pdf', 'file2.pdf'], 'merged.pdf');

Split PDF

/**
 * Split a PDF into one single-page file per page.
 *
 * Files are written as `<outputDir>/page-<n>.pdf`, with `n` starting at 1.
 *
 * @param {string} inputPath - Path of the PDF to split.
 * @param {string} outputDir - Directory that receives the page files; it is
 *   created (including missing parents) when it does not exist yet.
 * @returns {Promise<void>}
 */
async function splitPDF(inputPath, outputDir) {
  const pdfBytes = fs.readFileSync(inputPath);
  const pdf = await PDFDocument.load(pdfBytes);

  // writeFileSync does not create parent directories, so make sure the target
  // exists first; with `recursive` this is a no-op when it already does.
  fs.mkdirSync(outputDir, { recursive: true });

  const pageCount = pdf.getPageCount();
  for (let i = 0; i < pageCount; i++) {
    const singlePagePdf = await PDFDocument.create();
    const [page] = await singlePagePdf.copyPages(pdf, [i]);
    singlePagePdf.addPage(page);

    const bytes = await singlePagePdf.save();
    fs.writeFileSync(`${outputDir}/page-${i + 1}.pdf`, bytes);
  }
}

await splitPDF('document.pdf', './pages');

Create PDF

const PDFDocument = require('pdfkit');

/**
 * Create a two-page sample PDF with pdfkit and write it to disk.
 *
 * pdfkit produces its output as a stream, so the file is not complete when
 * this function returns. The returned promise lets callers wait for the write
 * to finish; callers that ignore it behave as before.
 *
 * @param {string} outputPath - Where the PDF is written.
 * @returns {Promise<void>} Resolves once the write stream has flushed,
 *   rejects if the stream reports an error.
 */
function createPDF(outputPath) {
  const doc = new PDFDocument();
  const stream = fs.createWriteStream(outputPath);

  // Register the listeners before any data flows so no event can be missed.
  const written = new Promise((resolve, reject) => {
    stream.on('finish', () => resolve());
    stream.on('error', reject);
  });

  doc.pipe(stream);

  // Add content
  doc.fontSize(25).text('Hello World!', 100, 100);

  doc.addPage()
     .fontSize(12)
     .text('Page 2 content');

  // Finalizes the document; the piped stream ends once everything is flushed.
  doc.end();

  return written;
}

createPDF('output.pdf');

HTML to PDF

const puppeteer = require('puppeteer');

/**
 * Render an HTML string to an A4 PDF using a headless browser.
 *
 * @param {string} htmlContent - HTML markup to render.
 * @param {string} outputPath - Where the PDF is written.
 * @returns {Promise<void>}
 * @throws Propagates any error from page creation or rendering; the browser
 *   is closed before the error reaches the caller.
 */
async function htmlToPdf(htmlContent, outputPath) {
  const browser = await puppeteer.launch();

  try {
    const page = await browser.newPage();

    await page.setContent(htmlContent);
    await page.pdf({
      path: outputPath,
      format: 'A4',
      printBackground: true
    });
  } finally {
    // Always shut the browser down, otherwise a failed render leaves the
    // launched browser process running.
    await browser.close();
  }
}

await htmlToPdf('<h1>Hello PDF</h1>', 'output.pdf');

Extract Metadata

const { PDFDocument } = require('pdf-lib');

/**
 * Read the document-information fields and the page count of a PDF.
 *
 * @param {string} pdfPath - Path of the PDF to inspect.
 * @returns {Promise<object>} Object with `title`, `author`, `subject`,
 *   `keywords`, `creator`, `producer`, `creationDate`, `modificationDate`
 *   and `pageCount`, each taken from the matching pdf-lib getter.
 */
async function getMetadata(pdfPath) {
  const doc = await PDFDocument.load(fs.readFileSync(pdfPath));

  const title = doc.getTitle();
  const author = doc.getAuthor();
  const subject = doc.getSubject();
  const keywords = doc.getKeywords();
  const creator = doc.getCreator();
  const producer = doc.getProducer();
  const creationDate = doc.getCreationDate();
  const modificationDate = doc.getModificationDate();
  const pageCount = doc.getPageCount();

  return {
    title,
    author,
    subject,
    keywords,
    creator,
    producer,
    creationDate,
    modificationDate,
    pageCount
  };
}

Add Pages from Another PDF

/**
 * Append every page of one PDF to the end of another and save the result.
 *
 * @param {string} sourcePath - PDF whose pages are appended.
 * @param {string} targetPath - PDF that receives the pages.
 * @param {string} outputPath - Where the combined PDF is written; the target
 *   file itself is only read.
 * @returns {Promise<void>}
 */
async function addPages(sourcePath, targetPath, outputPath) {
  const destination = await PDFDocument.load(fs.readFileSync(targetPath));
  const donor = await PDFDocument.load(fs.readFileSync(sourcePath));

  // Pages have to be copied into the destination document before being added.
  const donorPages = await destination.copyPages(donor, donor.getPageIndices());
  for (const donorPage of donorPages) {
    destination.addPage(donorPage);
  }

  fs.writeFileSync(outputPath, await destination.save());
}

Remove Pages

/**
 * Delete the given pages from a PDF and save the result to a new file.
 *
 * @param {string} inputPath - Path of the PDF to read.
 * @param {string} outputPath - Where the trimmed PDF is written.
 * @param {number[]} pagesToRemove - 1-based page numbers to delete, in any
 *   order. Duplicates are ignored and the array is left untouched.
 * @returns {Promise<void>}
 */
async function removePages(inputPath, outputPath, pagesToRemove) {
  const pdfBytes = fs.readFileSync(inputPath);
  const pdf = await PDFDocument.load(pdfBytes);

  // Work on a de-duplicated copy: sorting in place would reorder the caller's
  // array, and a repeated page number would delete a second, unrelated page
  // once the first removal has shifted the indices.
  const descendingPages = [...new Set(pagesToRemove)].sort((a, b) => b - a);

  // Remove from the back so earlier removals do not shift later indices.
  for (const pageNum of descendingPages) {
    pdf.removePage(pageNum - 1); // 0-indexed
  }

  const bytes = await pdf.save();
  fs.writeFileSync(outputPath, bytes);
}

await removePages('input.pdf', 'output.pdf', [3, 5, 7]);

Python Usage

Extract Text

import PyPDF2

# Read every page of document.pdf and print the extracted text as one string.
with open('document.pdf', 'rb') as pdf_file:
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    page_texts = [pdf_page.extract_text() for pdf_page in pdf_reader.pages]
    print(''.join(page_texts))

Merge PDFs

import PyPDF2

# Append each input file in order, then write the combined document.
input_paths = ['file1.pdf', 'file2.pdf']
pdf_merger = PyPDF2.PdfMerger()

for input_path in input_paths:
    pdf_merger.append(input_path)

pdf_merger.write('merged.pdf')
pdf_merger.close()

Extract Tables with pdfplumber

import pdfplumber

# Print every table pdfplumber finds, page by page.
with pdfplumber.open('document.pdf') as document:
    for current_page in document.pages:
        for extracted_table in current_page.extract_tables():
            print(extracted_table)

Extract Images

// Using pdf-lib
/**
 * Walk the pages of a PDF and log the size of every page that has a
 * resources dictionary.
 *
 * NOTE: this is a starting point only. It does not pull any image data out
 * of the document; it just reports `Page <n>: <width>x<height>` for pages
 * where images could be present.
 *
 * @param {string} pdfPath - Path of the PDF to inspect.
 * @returns {Promise<void>}
 */
async function extractImages(pdfPath) {
  const doc = await PDFDocument.load(fs.readFileSync(pdfPath));

  doc.getPages().forEach((page, index) => {
    const { width, height } = page.getSize();

    // Images are embedded as XObjects, which live under the page resources.
    if (!page.node.Resources()) {
      return;
    }

    // Extract image data
    console.log(`Page ${index + 1}: ${width}x${height}`);
  });
}

Encrypt PDF

const { PDFDocument, StandardFonts } = require('pdf-lib');

/**
 * Placeholder for PDF encryption.
 *
 * pdf-lib has no encryption support, so this function only parses the input
 * (which surfaces a missing or unreadable file) and prints a warning. Nothing
 * is written to `outputPath` and `password` is not used.
 *
 * @param {string} inputPath - Path of the PDF that would be encrypted.
 * @param {string} outputPath - Unused.
 * @param {string} password - Unused.
 * @returns {Promise<void>}
 */
async function encryptPDF(inputPath, outputPath, password) {
  await PDFDocument.load(fs.readFileSync(inputPath));

  // Note: pdf-lib doesn't support encryption directly
  // Use Python's PyPDF2 or external tools like pdftk
  console.warn('Use Python PyPDF2 or pdftk for encryption');
}

Python encryption:

import PyPDF2

# Copy every page of input.pdf into a new writer, password-protect it and
# save the result as encrypted.pdf.
source_reader = PyPDF2.PdfReader('input.pdf')
encrypted_writer = PyPDF2.PdfWriter()

for source_page in source_reader.pages:
    encrypted_writer.add_page(source_page)

encrypted_writer.encrypt('password123')

with open('encrypted.pdf', 'wb') as encrypted_file:
    encrypted_writer.write(encrypted_file)

Common Patterns

Extract Specific Pages

/**
 * Copy selected pages of a PDF into a new document and save it.
 *
 * @param {string} inputPath - Path of the PDF to read.
 * @param {string} outputPath - Where the new PDF is written.
 * @param {number[]} pageNumbers - 1-based page numbers to keep, in the order
 *   they should appear in the output.
 * @returns {Promise<void>}
 */
async function extractPages(inputPath, outputPath, pageNumbers) {
  const original = await PDFDocument.load(fs.readFileSync(inputPath));
  const selection = await PDFDocument.create();

  // pdf-lib addresses pages from 0, callers count from 1.
  const zeroBasedIndices = pageNumbers.map((pageNumber) => pageNumber - 1);
  const selectedPages = await selection.copyPages(original, zeroBasedIndices);

  for (const selectedPage of selectedPages) {
    selection.addPage(selectedPage);
  }

  fs.writeFileSync(outputPath, await selection.save());
}

await extractPages('input.pdf', 'output.pdf', [1, 3, 5]);

Rotate Pages

async function rotatePages(inputPath, outputPath, degrees) {
  const pdfBytes = fs.readFileSync(inputPath);
  const pdf = await PDFDocument.load(pdfBytes);

  const pages = pdf.getPages();
  pages.forEach(page => {
    page.setRotation({ angle: degrees });
  });

  const bytes = await pdf.save();
  fs.writeFileSync(outputPath, bytes);
}

await rotatePages('input.pdf', 'output.pdf', 90);

Add Watermark

async function addWatermark(inputPath, outputPath, watermarkText) {
  const pdfBytes = fs.readFileSync(inputPath);
  const pdf = await PDFDocument.load(pdfBytes);

  const pages = pdf.getPages();
  const font = await pdf.embedFont(StandardFonts.Helvetica);

  pages.forEach(page => {
    const { width, height } = page.getSize();
    page.drawText(watermarkText, {
      x: width / 2 - 100,
      y: height / 2,
      size: 50,
      font,
      color: rgb(0.95, 0.95, 0.95),
      opacity: 0.5
    });
  });

  const bytes = await pdf.save();
  fs.writeFileSync(outputPath, bytes);
}

Best Practices

Stream large PDFs to avoid memory issues
Validate PDF integrity before processing
Use pdf-lib for structure manipulation
Use pdf-parse for text extraction
Use pdfkit for PDF creation
Use puppeteer for HTML to PDF conversion
Handle encrypted PDFs with proper error messages
Test with various PDF versions (1.4, 1.5, 1.7, etc.)

Common Issues

Text extraction returns gibberish or nothing: the PDF's fonts have no Unicode mapping, or the pages are scanned images with no text layer

Solution: Use OCR (tesseract.js) for scanned PDFs

Memory errors with large PDFs: File too large for memory

Solution: Process page by page or use streaming

Corrupted output: Incompatible PDF version

Solution: Update libraries or use pdf-lib

Tools

Command-line alternatives:

# Merge PDFs
pdftk file1.pdf file2.pdf cat output merged.pdf

# Split PDF
pdftk input.pdf burst

# Extract pages
pdftk input.pdf cat 1-3 5 7-9 output selected.pdf

# Rotate
pdftk input.pdf cat 1-endeast output rotated.pdf

Resources

pdf-lib: https://pdf-lib.js.org
pdf-parse: https://www.npmjs.com/package/pdf-parse
pdfkit: https://pdfkit.org
PyPDF2: https://pypdf2.readthedocs.io
pdfplumber: https://github.com/jsvine/pdfplumber

PDF

Install Skill

SKILL.md

PDF

When to Use

Installation

Extract Text (JavaScript)

Merge PDFs

Split PDF

Create PDF

HTML to PDF

Extract Metadata

Add Pages from Another PDF

Remove Pages

Python Usage

Extract Text

Merge PDFs

Extract Tables with pdfplumber

Extract Images

Encrypt PDF

Common Patterns

Extract Specific Pages

Rotate Pages

Add Watermark

Best Practices

Common Issues

Tools

Resources