Claude Code Plugins

Community-maintained marketplace

Extract clean article text from web pages. Use when user wants to read article content without ads/clutter, save articles, or process web content.

Install Skill

1. Download skill

2. Enable skills in Claude

Open claude.ai/settings/capabilities and find the "Skills" section

3. Upload to Claude

Click "Upload skill" and select the downloaded ZIP file

Note: Please verify the skill by reading through its instructions before using it.

SKILL.md

name: Article Extractor
description: Extract clean article text from web pages. Use when user wants to read article content without ads/clutter, save articles, or process web content.
source: base

Article Extractor

Extract clean, readable content from web articles by removing ads, menus, and clutter.

When to Use

Use this skill when the user wants to:

  • Extract main article text from web pages
  • Remove ads, sidebars, and navigation
  • Save articles for offline reading
  • Process article content programmatically
  • Convert web pages to markdown
  • Get article metadata (title, author, date)

Installation

npm install @mozilla/readability jsdom
npm install node-fetch

Or use readability-cli:

npm install -g @mozilla/readability-cli

Python alternative:

pip install newspaper3k
pip install readability-lxml

JavaScript Usage

const fetch = require('node-fetch');
const { Readability } = require('@mozilla/readability');
const { JSDOM } = require('jsdom');

async function extractArticle(url) {
  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`Failed to fetch ${url}: ${response.status}`);
  }
  const html = await response.text();

  // Passing the page URL lets Readability resolve relative links
  const dom = new JSDOM(html, { url });
  const reader = new Readability(dom.window.document);
  const article = reader.parse();

  // parse() returns null when no article-like content is found
  if (!article) {
    throw new Error(`Could not extract an article from ${url}`);
  }

  return {
    title: article.title,
    author: article.byline,
    content: article.textContent,
    html: article.content,
    excerpt: article.excerpt,
    siteName: article.siteName,
    length: article.length
  };
}

const article = await extractArticle('https://example.com/article');
console.log(article.title);
console.log(article.content);

Save to File

const fs = require('fs');

async function saveArticle(url, outputPath) {
  const article = await extractArticle(url);

  const markdown = `# ${article.title}\n\n` +
                   `**Author:** ${article.author || 'Unknown'}\n\n` +
                   `**Source:** ${article.siteName || url}\n\n` +
                   `---\n\n${article.content}`;

  fs.writeFileSync(outputPath, markdown);
  console.log(`Saved to ${outputPath}`);
}

await saveArticle('https://example.com/article', 'article.md');

HTML to Markdown

const TurndownService = require('turndown');

async function articleToMarkdown(url) {
  const article = await extractArticle(url);

  const turndown = new TurndownService({
    headingStyle: 'atx',
    codeBlockStyle: 'fenced'
  });

  const markdown = turndown.turndown(article.html);

  return {
    title: article.title,
    author: article.author,
    markdown: markdown
  };
}

Batch Extraction

async function extractMultiple(urls) {
  const results = [];

  for (const url of urls) {
    try {
      const article = await extractArticle(url);
      results.push({ url, article, error: null });

      // Rate limiting
      await new Promise(resolve => setTimeout(resolve, 1000));
    } catch (error) {
      results.push({ url, article: null, error: error.message });
    }
  }

  return results;
}

const urls = [
  'https://example.com/article1',
  'https://example.com/article2'
];

const articles = await extractMultiple(urls);

Python Usage (newspaper3k)

from newspaper import Article

url = 'https://example.com/article'
article = Article(url)

article.download()
article.parse()

print('Title:', article.title)
print('Authors:', article.authors)
print('Date:', article.publish_date)
print('Text:', article.text)
print('Top image:', article.top_image)

Python Usage (readability)

import requests
from readability import Document

response = requests.get('https://example.com/article')
doc = Document(response.text)

print('Title:', doc.title())
print('Content:', doc.summary())

CLI Usage

# Extract article
npx @mozilla/readability-cli https://example.com/article

# Save to file
npx @mozilla/readability-cli https://example.com/article > article.html

# Extract text only
npx @mozilla/readability-cli https://example.com/article --text-only

Advanced Patterns

With Full Browser (for JavaScript-heavy sites)

const puppeteer = require('puppeteer');
const { Readability } = require('@mozilla/readability');
const { JSDOM } = require('jsdom');

async function extractWithBrowser(url) {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();

  await page.goto(url, { waitUntil: 'networkidle2' });

  // Take the fully rendered HTML out of the browser, then run Readability in Node
  // (require() is not available inside page.evaluate)
  const html = await page.content();
  await browser.close();

  const dom = new JSDOM(html, { url });
  const reader = new Readability(dom.window.document);
  return reader.parse();
}

Extract with Images

async function extractWithImages(url) {
  const article = await extractArticle(url);

  // Parse images from HTML content
  const dom = new JSDOM(article.html);
  const images = Array.from(dom.window.document.querySelectorAll('img'))
    .map(img => ({
      src: img.src,
      alt: img.alt
    }));

  return {
    ...article,
    images
  };
}

Extract Metadata

const metascraper = require('metascraper')([
  require('metascraper-author')(),
  require('metascraper-date')(),
  require('metascraper-description')(),
  require('metascraper-image')(),
  require('metascraper-title')()
]);

async function getMetadata(url) {
  const response = await fetch(url);
  const html = await response.text();

  const metadata = await metascraper({ html, url });

  return {
    title: metadata.title,
    author: metadata.author,
    date: metadata.date,
    description: metadata.description,
    image: metadata.image
  };
}

Filter by Content Length

async function extractLongArticle(url, minLength = 1000) {
  const article = await extractArticle(url);

  if (article.content.length < minLength) {
    throw new Error(`Article too short: ${article.content.length} chars`);
  }

  return article;
}

Convert to Different Formats

To Plain Text

function stripHTML(html) {
  const dom = new JSDOM(html);
  return dom.window.document.body.textContent;
}

const article = await extractArticle(url);
const plainText = stripHTML(article.html);

To JSON

async function saveAsJSON(url, outputPath) {
  const article = await extractArticle(url);

  fs.writeFileSync(outputPath, JSON.stringify(article, null, 2));
}

await saveAsJSON('https://example.com/article', 'article.json');

To PDF

const PDFDocument = require('pdfkit');

async function articleToPDF(url, outputPath) {
  const article = await extractArticle(url);

  const doc = new PDFDocument();
  doc.pipe(fs.createWriteStream(outputPath));

  doc.fontSize(20).text(article.title);
  doc.moveDown();
  doc.fontSize(12).text(article.author || '');
  doc.moveDown();
  doc.fontSize(10).text(article.content);

  doc.end();
}

Common Patterns

RSS Feed Reader

const Parser = require('rss-parser');

async function readFeed(feedUrl) {
  const parser = new Parser();
  const feed = await parser.parseURL(feedUrl);

  const articles = [];

  for (const item of feed.items.slice(0, 5)) {
    const article = await extractArticle(item.link);
    articles.push({
      title: item.title,
      link: item.link,
      content: article.content
    });

    await new Promise(resolve => setTimeout(resolve, 1000));
  }

  return articles;
}

Article Archive

async function archiveArticle(url) {
  const article = await extractArticle(url);
  const timestamp = new Date().toISOString().split('T')[0];
  const filename = `${timestamp}-${article.title.replace(/[^a-z0-9]/gi, '-').toLowerCase()}.md`;

  const markdown = `# ${article.title}\n\n` +
                   `**Date archived:** ${timestamp}\n` +
                   `**URL:** ${url}\n` +
                   `**Author:** ${article.author || 'Unknown'}\n\n` +
                   `---\n\n${article.content}`;

  // Make sure the archive directory exists before writing
  fs.mkdirSync('archive', { recursive: true });
  fs.writeFileSync(`archive/${filename}`, markdown);
}

Best Practices

  • Respect robots.txt and rate limits
  • Add delays between requests (1-2 seconds)
  • Cache extracted content to avoid re-fetching (see the sketch after this list)
  • Handle errors gracefully (some pages won't extract)
  • Use full browser (Puppeteer) for JavaScript-heavy sites
  • Verify content quality after extraction
  • Store original URL with extracted content
  • Consider copyright and terms of service
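
A minimal on-disk cache sketch, assuming the extractArticle function defined above; the cache directory name and hashing scheme are illustrative choices, not part of the skill:

const fs = require('fs');
const path = require('path');
const crypto = require('crypto');

const CACHE_DIR = '.article-cache'; // illustrative location

async function extractArticleCached(url) {
  fs.mkdirSync(CACHE_DIR, { recursive: true });

  // Derive a stable filename from the URL
  const key = crypto.createHash('sha256').update(url).digest('hex');
  const cachePath = path.join(CACHE_DIR, `${key}.json`);

  if (fs.existsSync(cachePath)) {
    return JSON.parse(fs.readFileSync(cachePath, 'utf8'));
  }

  const article = await extractArticle(url);
  fs.writeFileSync(cachePath, JSON.stringify(article, null, 2));
  return article;
}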

Common Issues

Empty or incomplete content: Page uses JavaScript to load content

Solution: Use Puppeteer instead of simple fetch

Extraction fails: Non-article page (e.g., home page)

Solution: Check article.length before processing

Images missing or broken: Relative URLs not resolved

Solution: Convert relative URLs to absolute
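
If relative src attributes survive extraction, one way to make them absolute, assuming the extractArticle function above (the helper name is illustrative); the WHATWG URL constructor resolves each src against the article URL:

const { JSDOM } = require('jsdom');

async function extractWithAbsoluteImageUrls(url) {
  const article = await extractArticle(url);

  const dom = new JSDOM(article.html);
  for (const img of dom.window.document.querySelectorAll('img')) {
    const src = img.getAttribute('src');
    if (src) {
      // Resolve relative paths (e.g. /images/photo.png) against the article URL
      img.setAttribute('src', new URL(src, url).href);
    }
  }

  return { ...article, html: dom.window.document.body.innerHTML };
}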

Rate limited: Too many requests

Solution: Add delays and respect robots.txt
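
A sketch of a robots.txt check, assuming the community robots-parser package (an extra dependency not listed in this skill's install steps):

const fetch = require('node-fetch');
const robotsParser = require('robots-parser'); // assumed extra dependency

async function isAllowedByRobots(url, userAgent = 'article-extractor') {
  const robotsUrl = new URL('/robots.txt', url).href;
  const response = await fetch(robotsUrl);
  const body = response.ok ? await response.text() : '';

  // A missing or empty robots.txt is treated as allowing everything
  const robots = robotsParser(robotsUrl, body);
  return robots.isAllowed(url, userAgent) !== false;
}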

Tools

Command-line tools:

# Readability CLI
npx @mozilla/readability-cli URL

# Save to file
curl URL | npx @mozilla/readability-cli > article.html

# Mercury Parser (alternative)
npx @postlight/mercury-parser URL

Browser extensions:

  • Reader View (Firefox built-in)
  • Just Read (Chrome extension)
  • Clearly (Chrome extension)

Resources