name	pdf
description	PDF generation and manipulation for compliance reports, schedule printouts, and document extraction. Use when creating printable documents or extracting data from PDFs.

PDF Processing Skill

Comprehensive PDF operations for generating compliance reports, printable schedules, and extracting data from uploaded documents.

When This Skill Activates

Generating printable schedule PDFs
Creating ACGME compliance reports
Extracting data from uploaded PDF documents
Merging or splitting PDF files
Adding watermarks or headers to documents

Required Libraries

# PDF generation
from reportlab.lib.pagesizes import letter, A4
from reportlab.lib import colors
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer

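# PDF reading and manipulation (note: PdfMerger is deprecated and was removed in pypdf 5.0; on current pypdf use PdfWriter.append to merge)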
import pypdf
from pypdf import PdfReader, PdfWriter, PdfMerger

# Text and table extraction
import pdfplumber

# OCR for scanned documents (optional)
# import pytesseract
# from pdf2image import convert_from_path

PDF Generation Patterns

Schedule Report

from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer
from reportlab.lib.pagesizes import letter, landscape
from reportlab.lib import colors
from reportlab.lib.styles import getSampleStyleSheet
from io import BytesIO
from datetime import date

def generate_schedule_pdf(
    schedule_data: dict,
    start_date: date,
    end_date: date
) -> BytesIO:
    """Generate a printable, landscape-letter schedule PDF.

    Args:
        schedule_data: Mapping with two keys read by this function:
            ``'dates'`` (iterable of objects supporting ``strftime``, one
            per column) and ``'assignments'`` (iterable of mappings with
            ``'name'``, ``'role'`` and ``'daily_assignments'``, the latter
            being the per-date cell values for that person).
        start_date: First day of the schedule, shown in the title.
        end_date: Last day of the schedule, shown in the title.

    Returns:
        An in-memory buffer holding the PDF, rewound to position 0.
    """
    # ``date`` objects carry no time of day, so the footer timestamp needs
    # ``datetime``; imported locally to keep this snippet self-contained.
    from datetime import datetime

    buffer = BytesIO()
    doc = SimpleDocTemplate(
        buffer,
        pagesize=landscape(letter),
        rightMargin=0.5*inch,
        leftMargin=0.5*inch,
        topMargin=0.5*inch,
        bottomMargin=0.5*inch
    )

    styles = getSampleStyleSheet()
    elements = []

    # Title
    title = Paragraph(
        f"<b>Schedule: {start_date.strftime('%B %d')} - {end_date.strftime('%B %d, %Y')}</b>",
        styles['Title']
    )
    elements.append(title)
    elements.append(Spacer(1, 0.25*inch))

    # Build table data: one header row, then one row per person.
    headers = ['Name', 'Role'] + [d.strftime('%a %m/%d') for d in schedule_data['dates']]
    table_data = [headers]
    table_data.extend(
        [person['name'], person['role'], *person['daily_assignments']]
        for person in schedule_data['assignments']
    )

    # repeatRows=1 re-prints the header row when the table spans pages.
    table = Table(table_data, repeatRows=1)
    table.setStyle(TableStyle([
        # Header styling
        ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#366092')),
        ('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
        ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
        ('FONTSIZE', (0, 0), (-1, 0), 10),
        ('ALIGN', (0, 0), (-1, 0), 'CENTER'),

        # Body styling
        ('FONTNAME', (0, 1), (-1, -1), 'Helvetica'),
        ('FONTSIZE', (0, 1), (-1, -1), 8),
        ('ALIGN', (0, 1), (-1, -1), 'CENTER'),

        # Grid
        ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),

        # Alternating row colors
        ('ROWBACKGROUNDS', (0, 1), (-1, -1), [colors.white, colors.HexColor('#F0F0F0')]),
    ]))

    elements.append(table)

    # Footer with generation timestamp. Using datetime.now() (not
    # date.today()) so that %H:%M reflects the real time instead of 00:00.
    elements.append(Spacer(1, 0.5*inch))
    footer = Paragraph(
        f"<i>Generated: {datetime.now().strftime('%Y-%m-%d %H:%M')}</i>",
        styles['Normal']
    )
    elements.append(footer)

    doc.build(elements)
    buffer.seek(0)
    return buffer

ACGME Compliance Report

def generate_compliance_report_pdf(
    compliance_data: dict,
    period_start: date,
    period_end: date
) -> BytesIO:
    """Generate an ACGME compliance report PDF (letter, portrait).

    The report contains a summary table, an optional bullet list of
    violations, and a per-resident detail table.

    Args:
        compliance_data: Mapping read with these keys:
            ``hours_compliant``, ``day_off_compliant``,
            ``supervision_compliant``, ``total_residents``,
            ``total_checks`` (counts); ``hours_rate``, ``day_off_rate``,
            ``supervision_rate`` (numbers formatted as percentages);
            ``residents`` (iterable of mappings with ``name``,
            ``pgy_level``, ``avg_hours``, ``hours_compliant``,
            ``days_off``, ``day_off_compliant``); and optionally
            ``violations`` (iterable of mappings with ``resident``,
            ``type``, ``details``).
        period_start: First day of the reporting period.
        period_end: Last day of the reporting period.

    Returns:
        An in-memory buffer holding the PDF, rewound to position 0.
    """
    # Paragraph text is parsed as XML-like markup, so data-derived strings
    # must be escaped before being interpolated into it.
    from xml.sax.saxutils import escape

    buffer = BytesIO()
    doc = SimpleDocTemplate(buffer, pagesize=letter)

    styles = getSampleStyleSheet()
    elements = []

    # Shared header-row styling for both tables.
    header_fill = colors.HexColor('#366092')

    # Header
    elements.append(Paragraph(
        "<b>ACGME Compliance Report</b>",
        styles['Title']
    ))
    elements.append(Paragraph(
        f"Period: {period_start.strftime('%B %d, %Y')} - {period_end.strftime('%B %d, %Y')}",
        styles['Normal']
    ))
    elements.append(Spacer(1, 0.25*inch))

    # Summary section
    elements.append(Paragraph("<b>Compliance Summary</b>", styles['Heading2']))

    summary_data = [
        ['Metric', 'Compliant', 'Total', 'Rate'],
        ['80-Hour Rule', str(compliance_data['hours_compliant']),
         str(compliance_data['total_residents']),
         f"{compliance_data['hours_rate']:.1%}"],
        ['1-in-7 Rule', str(compliance_data['day_off_compliant']),
         str(compliance_data['total_residents']),
         f"{compliance_data['day_off_rate']:.1%}"],
        ['Supervision Ratios', str(compliance_data['supervision_compliant']),
         str(compliance_data['total_checks']),
         f"{compliance_data['supervision_rate']:.1%}"],
    ]

    summary_table = Table(summary_data, colWidths=[2*inch, 1*inch, 1*inch, 1*inch])
    summary_table.setStyle(TableStyle([
        ('BACKGROUND', (0, 0), (-1, 0), header_fill),
        ('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
        ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
        ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
        ('ALIGN', (1, 0), (-1, -1), 'CENTER'),
    ]))

    elements.append(summary_table)
    elements.append(Spacer(1, 0.5*inch))

    # Violations section (only when the key is present and non-empty)
    violations = compliance_data.get('violations')
    if violations:
        elements.append(Paragraph("<b>Violations</b>", styles['Heading2']))

        for violation in violations:
            # Escape each field so characters such as '&' or '<' in names
            # or free-text details cannot break or inject Paragraph markup.
            resident = escape(str(violation['resident']))
            kind = escape(str(violation['type']))
            details = escape(str(violation['details']))
            elements.append(Paragraph(
                f"• <b>{resident}</b>: {kind} - {details}",
                styles['Normal']
            ))
        elements.append(Spacer(1, 0.25*inch))

    # Individual resident details. Table cells are plain strings (not
    # Paragraphs), so no markup escaping is needed here.
    elements.append(Paragraph("<b>Individual Compliance</b>", styles['Heading2']))

    detail_data = [['Resident', 'PGY', 'Avg Hours', '80hr', 'Days Off', '1-in-7']]
    for resident in compliance_data['residents']:
        detail_data.append([
            resident['name'],
            f"PGY-{resident['pgy_level']}",
            f"{resident['avg_hours']:.1f}",
            '✓' if resident['hours_compliant'] else '✗',
            str(resident['days_off']),
            '✓' if resident['day_off_compliant'] else '✗',
        ])

    detail_table = Table(detail_data, colWidths=[1.5*inch, 0.5*inch, 0.8*inch, 0.5*inch, 0.7*inch, 0.5*inch])
    detail_table.setStyle(TableStyle([
        ('BACKGROUND', (0, 0), (-1, 0), header_fill),
        ('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
        ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
        ('FONTSIZE', (0, 0), (-1, -1), 9),
        ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
        ('ALIGN', (1, 0), (-1, -1), 'CENTER'),
    ]))

    elements.append(detail_table)

    doc.build(elements)
    buffer.seek(0)
    return buffer

PDF Reading and Extraction

Extract Text from PDF

import pdfplumber

def extract_text_from_pdf(file_path: str) -> str:
    """Return the text of every page in the PDF, pages separated by a blank line.

    Pages for which pdfplumber yields no text (``None`` or an empty
    string) are left out of the result.
    """
    with pdfplumber.open(file_path) as pdf:
        # Pull the text while the document is still open.
        per_page = [page.extract_text() for page in pdf.pages]

    return "\n\n".join(chunk for chunk in per_page if chunk)

Extract Tables from PDF

import pandas as pd
import pdfplumber

def extract_tables_from_pdf(file_path: str) -> list[pd.DataFrame]:
    """Collect every table found in the PDF as a list of DataFrames.

    The first row of each extracted table becomes the column header.
    Tables that are empty or consist of a single row are skipped. Each
    frame records where it came from in ``df.attrs['source']``
    (1-based page and table numbers).
    """
    frames = []

    with pdfplumber.open(file_path) as pdf:
        for page_no, page in enumerate(pdf.pages, start=1):
            for table_no, rows in enumerate(page.extract_tables(), start=1):
                # Need a header row plus at least one data row.
                if not rows or len(rows) <= 1:
                    continue

                header, *body = rows
                frame = pd.DataFrame(body, columns=header)
                frame.attrs['source'] = f"Page {page_no}, Table {table_no}"
                frames.append(frame)

    return frames

Import Schedule from PDF

async def import_schedule_from_pdf(
    file_path: str,
    db: AsyncSession,
    schedule_id: str
) -> tuple[list, list]:
    """
    Try to import schedule data from the tables of a PDF.

    Returns a ``(created, errors)`` pair. When the PDF contains no
    tables, a single error entry is reported. The per-table import step
    is still a placeholder, so ``created`` is currently always empty.

    Note: PDF parsing is less reliable than Excel.
    Best for structured, table-based PDFs.
    """
    created, errors = [], []

    tables = extract_tables_from_pdf(file_path)
    if not tables:
        errors.append({'error': 'No tables found in PDF'})
        return created, errors

    for df in tables:
        # A table is treated as a schedule only if some header looks like a date.
        date_cols = list(filter(_looks_like_date, df.columns))
        if date_cols:
            # Process as schedule
            # ... similar logic to Excel import
            pass

    return created, errors


def _looks_like_date(value: str) -> bool:
    """Check if value looks like a date header."""
    import re
    date_patterns = [
        r'\d{1,2}/\d{1,2}',  # MM/DD
        r'Mon|Tue|Wed|Thu|Fri|Sat|Sun',  # Day names
        r'Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec',  # Month names
    ]
    return any(re.search(p, str(value), re.I) for p in date_patterns)

PDF Manipulation

Merge PDFs

from pypdf import PdfWriter

def merge_pdfs(input_files: list[str], output_path: str) -> None:
    """Merge multiple PDFs into one file, in the order given.

    Uses ``PdfWriter.append`` rather than ``PdfMerger``: ``PdfMerger`` is
    deprecated and no longer available in pypdf 5.0+.

    Args:
        input_files: Paths of the PDFs to concatenate.
        output_path: Path the merged PDF is written to.
    """
    writer = PdfWriter()
    try:
        for pdf_file in input_files:
            writer.append(pdf_file)

        writer.write(output_path)
    finally:
        # Release the writer even if reading an input or writing fails.
        writer.close()

Split PDF

from pypdf import PdfReader, PdfWriter

def split_pdf(input_file: str, output_dir: str) -> list[str]:
    """Write each page of a PDF to its own single-page file.

    Files are named ``page_<n>.pdf`` (1-based) inside ``output_dir``.

    Returns:
        The paths of the files written, in page order.
    """
    written = []

    for page_no, page in enumerate(PdfReader(input_file).pages, start=1):
        single_page = PdfWriter()
        single_page.add_page(page)

        target = f"{output_dir}/page_{page_no}.pdf"
        with open(target, 'wb') as handle:
            single_page.write(handle)
        written.append(target)

    return written

Add Watermark

from pypdf import PdfReader, PdfWriter
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
from io import BytesIO

def add_watermark(input_file: str, watermark_text: str, output_file: str) -> None:
    """Stamp a diagonal text watermark onto every page of a PDF.

    A one-page, letter-sized overlay is drawn in memory with ReportLab
    and then merged onto each page of ``input_file``; the result is
    written to ``output_file``.
    """
    # Draw the overlay page: 50pt gray text at 30% alpha, rotated 45 degrees
    # around the point (300, 400).
    overlay_bytes = BytesIO()
    pen = canvas.Canvas(overlay_bytes, pagesize=letter)
    pen.setFont("Helvetica", 50)
    pen.setFillColorRGB(0.5, 0.5, 0.5, 0.3)
    pen.saveState()
    pen.translate(300, 400)
    pen.rotate(45)
    pen.drawCentredString(0, 0, watermark_text)
    pen.restoreState()
    pen.save()
    overlay_bytes.seek(0)

    overlay_page = PdfReader(overlay_bytes).pages[0]

    # Merge the overlay onto every source page and collect the results.
    output = PdfWriter()
    for source_page in PdfReader(input_file).pages:
        source_page.merge_page(overlay_page)
        output.add_page(source_page)

    with open(output_file, 'wb') as handle:
        output.write(handle)

FastAPI Integration

Export Endpoint

from fastapi import APIRouter
from fastapi.responses import StreamingResponse

router = APIRouter()


@router.get("/schedules/{schedule_id}/export/pdf")
async def export_schedule_pdf(
    schedule_id: str,
    db: AsyncSession = Depends(get_db)
):
    """Return the schedule with the given id as a downloadable PDF."""
    schedule = await get_schedule_with_assignments(db, schedule_id)

    pdf_buffer = generate_schedule_pdf(
        schedule_data=schedule,
        start_date=schedule.start_date,
        end_date=schedule.end_date,
    )

    # "attachment" asks the client to download rather than display inline.
    download_headers = {
        "Content-Disposition": f"attachment; filename=schedule_{schedule_id}.pdf"
    }
    return StreamingResponse(
        pdf_buffer,
        media_type="application/pdf",
        headers=download_headers,
    )


@router.get("/compliance/report/pdf")
async def export_compliance_report(
    start_date: date,
    end_date: date,
    db: AsyncSession = Depends(get_db)
):
    """Return the ACGME compliance report for the given period as a downloadable PDF."""
    compliance_data = await calculate_compliance(db, start_date, end_date)

    pdf_buffer = generate_compliance_report_pdf(
        compliance_data=compliance_data,
        period_start=start_date,
        period_end=end_date,
    )

    # "attachment" asks the client to download rather than display inline.
    download_headers = {
        "Content-Disposition": f"attachment; filename=compliance_report_{start_date}_{end_date}.pdf"
    }
    return StreamingResponse(
        pdf_buffer,
        media_type="application/pdf",
        headers=download_headers,
    )

Security Considerations

Uploaded PDF Validation

import magic

def validate_pdf_upload(file_content: bytes) -> bool:
    """Return True only if the uploaded bytes are recognised as a PDF.

    Two checks must both pass: the MIME type detected from the content
    is ``application/pdf``, and the data begins with the ``%PDF`` marker.
    """
    detected_mime = magic.from_buffer(file_content, mime=True)
    return detected_mime == 'application/pdf' and file_content.startswith(b'%PDF')

Sanitize Extracted Text

import re

def sanitize_extracted_text(text: str) -> str:
    """Remove potentially dangerous content from extracted text.

    Strips control characters (keeping tab, newline and carriage return)
    and removes ``<script>...</script>`` blocks, then trims surrounding
    whitespace.

    This is a basic defence-in-depth filter only; text rendered into
    HTML must still be escaped at output time.

    Args:
        text: Raw text extracted from a PDF.

    Returns:
        The cleaned text.
    """
    # Remove control characters first, so they cannot be used to split
    # a "<script" token and dodge the pattern below.
    text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]', '', text)

    # Remove script-like patterns (basic XSS prevention). The closing tag
    # may carry whitespace/junk before '>' ("</script >"). Repeat until
    # nothing is left to remove: a single pass can be defeated by nesting,
    # e.g. "<scr<script></script>ipt>" collapses into "<script>".
    script_block = r'<script[^>]*>.*?</script[^>]*>'
    removed = 1
    while removed:
        text, removed = re.subn(script_block, '', text, flags=re.I | re.S)

    return text.strip()

Verification Checklist

Before finalizing any PDF operation:

PDF renders correctly in multiple viewers
Tables fit within page margins
Fonts are embedded (for portability)
File size is reasonable
No sensitive data in metadata
Page orientation matches content

References

ReportLab Documentation
pypdf Documentation
pdfplumber Documentation
Project exports: backend/app/services/exports/

pdf

Install Skill

SKILL.md

PDF Processing Skill

When This Skill Activates

Required Libraries

PDF Generation Patterns

Schedule Report

ACGME Compliance Report

PDF Reading and Extraction

Extract Text from PDF

Extract Tables from PDF

Import Schedule from PDF

PDF Manipulation

Merge PDFs

Split PDF

Add Watermark

FastAPI Integration

Export Endpoint

Security Considerations

Uploaded PDF Validation

Sanitize Extracted Text

Verification Checklist

References