| name | knowledge-ingestion-patterns |
| description | Patterns for ingesting knowledge into vector databases and RAG systems |
Knowledge Ingestion Patterns Skill
Systematic approaches for ingesting different content types into RAG systems, with chunking and metadata strategies that preserve retrieval quality.
Overview
Different content types require different ingestion strategies. This skill documents best practices for:
- Websites and web content
- PDF documents
- Code repositories
- Conversation exports
- Research notes
- API documentation
Core Principles
- Chunk for retrieval - Optimize chunk size for the questions you'll ask
- Metadata matters - Rich metadata enables filtered search
- Preserve context - Don't lose meaning when splitting
- Deduplicate - Avoid ingesting the same content twice (see the hashing sketch below this list)
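A minimal sketch of the deduplication principle: key each chunk on a hash of its normalized content before upserting. The `dedupe_chunks` helper is illustrative and not tied to any particular vector store.

```python
import hashlib
from typing import Dict, Iterable, List

def dedupe_chunks(chunks: Iterable[Dict]) -> List[Dict]:
    """Drop chunks whose normalized content has already been seen."""
    seen_hashes = set()
    unique = []
    for chunk in chunks:
        # Normalize whitespace and case so trivial reformatting doesn't defeat dedup
        normalized = " ".join(chunk["content"].split()).lower()
        digest = hashlib.sha256(normalized.encode("utf-8")).hexdigest()
        if digest in seen_hashes:
            continue
        seen_hashes.add(digest)
        # A stable content-derived ID also lets a vector store upsert instead of duplicating
        chunk.setdefault("id", digest[:16])
        unique.append(chunk)
    return unique
```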
Content Type Patterns
Pattern 2: PDF Documents
When to use: Research papers, reports, ebooks, scanned documents
Chunking Strategy: Page-aware with overlap, handle tables/figures specially
```python
import fitz  # PyMuPDF
from typing import List, Dict

def chunk_pdf(pdf_path: str, chunk_size: int = 500) -> List[Dict]:
    """Extract and chunk PDF content with page awareness."""
    doc = fitz.open(pdf_path)
    chunks = []
    for page_num, page in enumerate(doc, 1):
        text = page.get_text()
        # Skip empty pages
        if not text.strip():
            continue
        # Split into paragraphs (PyMuPDF separates extracted lines with newlines)
        paragraphs = text.split('\n')
        current_chunk = ""
        for para in paragraphs:
            para = para.strip()
            if not para:
                continue
            if len(current_chunk) + len(para) < chunk_size:
                current_chunk += " " + para
            else:
                if current_chunk:
                    chunks.append({
                        "content": current_chunk.strip(),
                        "metadata": {
                            "type": "pdf",
                            "source": pdf_path,
                            "page": page_num,
                            "total_pages": len(doc)
                        }
                    })
                current_chunk = para
        # Don't forget the last chunk of the page
        if current_chunk:
            chunks.append({
                "content": current_chunk.strip(),
                "metadata": {
                    "type": "pdf",
                    "source": pdf_path,
                    "page": page_num,
                    "total_pages": len(doc)
                }
            })
    return chunks

def extract_pdf_tables(pdf_path: str) -> List[Dict]:
    """Extract tables from PDF as separate chunks."""
    import pdfplumber
    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, 1):
            for table_num, table in enumerate(page.extract_tables(), 1):
                # Convert table to markdown format
                if table:
                    headers = table[0]
                    rows = table[1:]
                    md_table = "| " + " | ".join(str(h) for h in headers) + " |\n"
                    md_table += "| " + " | ".join("---" for _ in headers) + " |\n"
                    for row in rows:
                        md_table += "| " + " | ".join(str(c) for c in row) + " |\n"
                    tables.append({
                        "content": md_table,
                        "metadata": {
                            "type": "pdf_table",
                            "source": pdf_path,
                            "page": page_num,
                            "table_number": table_num
                        }
                    })
    return tables
```
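The strategy above calls for overlap between chunks, which `chunk_pdf` as written does not add. A minimal post-processing sketch (the `add_overlap` helper is hypothetical, not part of the pattern above) prepends the tail of each preceding chunk:

```python
from typing import List, Dict

def add_overlap(chunks: List[Dict], overlap_chars: int = 100) -> List[Dict]:
    """Prepend the tail of the previous chunk so context survives the split."""
    # Capture the original tails first so overlaps don't compound across chunks
    tails = [c["content"][-overlap_chars:] for c in chunks]
    for i in range(1, len(chunks)):
        prev, curr = chunks[i - 1], chunks[i]
        # Only overlap chunks that came from the same source document
        if prev["metadata"].get("source") == curr["metadata"].get("source"):
            curr["content"] = tails[i - 1] + " " + curr["content"]
    return chunks
```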
Metadata Schema:
type: pdf | pdf_table
source: file path
page: page number
total_pages: document length
table_number: (for tables) which table on page
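This metadata is what makes filtered retrieval possible. Here is a sketch of ingesting and querying the PDF chunks, assuming a Chroma collection (the file name and query text are illustrative; any vector store with metadata filters works the same way):

```python
import chromadb

client = chromadb.Client()
collection = client.get_or_create_collection("knowledge")

# Combine prose chunks and table chunks from the same document
chunks = chunk_pdf("report.pdf") + extract_pdf_tables("report.pdf")
collection.add(
    ids=[f"report-{i}" for i in range(len(chunks))],
    documents=[c["content"] for c in chunks],
    metadatas=[c["metadata"] for c in chunks],
)

# Restrict retrieval to tables only, using the `type` field from the schema
results = collection.query(
    query_texts=["quarterly revenue by region"],
    where={"type": "pdf_table"},
    n_results=5,
)
```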
Pattern 4: Websites / Web Content
When to use: Documentation sites, articles, blog posts
Chunking Strategy: Clean HTML, respect structure, handle navigation
```python
import httpx
from bs4 import BeautifulSoup
from typing import List, Dict
from urllib.parse import urljoin, urlparse

def chunk_webpage(url: str) -> List[Dict]:
    """Fetch and chunk a webpage."""
    response = httpx.get(url, follow_redirects=True)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Remove noise
    for tag in soup.find_all(['nav', 'footer', 'aside', 'script', 'style']):
        tag.decompose()
    chunks = []
    # Find main content
    main = soup.find('main') or soup.find('article') or soup.find('body')
    # Chunk by sections
    for section in main.find_all(['section', 'div'], class_=lambda x: x and 'content' in str(x).lower()):
        text = section.get_text(separator=' ', strip=True)
        if len(text) > 100:  # Skip tiny sections
            chunks.append({
                "content": text,
                "metadata": {
                    "type": "webpage",
                    "source": url,
                    "domain": urlparse(url).netloc,
                    "title": soup.title.string if soup.title else ""
                }
            })
    # If no sections found, chunk the whole page
    if not chunks:
        text = main.get_text(separator=' ', strip=True)
        # Split into ~500 word chunks with a 50-word overlap
        words = text.split()
        for i in range(0, len(words), 450):
            chunk_text = ' '.join(words[i:i+500])
            chunks.append({
                "content": chunk_text,
                "metadata": {
                    "type": "webpage",
                    "source": url,
                    "domain": urlparse(url).netloc,
                    "title": soup.title.string if soup.title else ""
                }
            })
    return chunks

async def crawl_site(start_url: str, max_pages: int = 50) -> List[Dict]:
    """Crawl a site and chunk all pages."""
    base_domain = urlparse(start_url).netloc
    visited = set()
    to_visit = [start_url]
    all_chunks = []
    async with httpx.AsyncClient() as client:
        while to_visit and len(visited) < max_pages:
            url = to_visit.pop(0)
            if url in visited:
                continue
            try:
                response = await client.get(url, follow_redirects=True)
                visited.add(url)
                # Chunk this page
                all_chunks.extend(chunk_webpage(url))
                # Find same-domain links to follow
                soup = BeautifulSoup(response.text, 'html.parser')
                for link in soup.find_all('a', href=True):
                    href = urljoin(url, link['href'])
                    if urlparse(href).netloc == base_domain and href not in visited:
                        to_visit.append(href)
            except Exception as e:
                print(f"Failed to fetch {url}: {e}")
    return all_chunks
```
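Since `crawl_site` is a coroutine, it needs an event loop. A typical invocation looks like this (the URL is illustrative):

```python
import asyncio

chunks = asyncio.run(crawl_site("https://docs.example.com", max_pages=25))
pages = {c["metadata"]["source"] for c in chunks}
print(f"Collected {len(chunks)} chunks from {len(pages)} pages")
```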
Metadata Schema:
type: webpage
source: full URL
domain: domain name
title: page title
crawl_depth: (for crawls) how many links from start
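The `crawl_depth` field appears in the schema but is not recorded by `crawl_site` above. One way to add it, sketched here as a variant that reuses `chunk_webpage`, is to carry the depth through the crawl queue and stamp it onto each page's chunks:

```python
import httpx
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from typing import List, Dict

async def crawl_site_with_depth(start_url: str, max_pages: int = 50) -> List[Dict]:
    """Like crawl_site, but records how many links each page is from the start."""
    base_domain = urlparse(start_url).netloc
    visited = set()
    to_visit = [(start_url, 0)]  # (url, depth) pairs
    all_chunks = []
    async with httpx.AsyncClient() as client:
        while to_visit and len(visited) < max_pages:
            url, depth = to_visit.pop(0)
            if url in visited:
                continue
            try:
                response = await client.get(url, follow_redirects=True)
                visited.add(url)
                # chunk_webpage comes from the pattern above
                page_chunks = chunk_webpage(url)
                for chunk in page_chunks:
                    chunk["metadata"]["crawl_depth"] = depth
                all_chunks.extend(page_chunks)
                soup = BeautifulSoup(response.text, 'html.parser')
                for link in soup.find_all('a', href=True):
                    href = urljoin(url, link['href'])
                    if urlparse(href).netloc == base_domain and href not in visited:
                        to_visit.append((href, depth + 1))
            except Exception as e:
                print(f"Failed to fetch {url}: {e}")
    return all_chunks
```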
Pattern 6: Research Notes
When to use: Personal notes, research findings, learnings
Chunking Strategy: By paragraph with topic extraction
```python
from typing import List, Dict
from datetime import datetime

def chunk_research_notes(content: str, topic: str = None) -> List[Dict]:
    """Chunk research notes with topic awareness."""
    # Split by double newlines (paragraphs)
    paragraphs = [p.strip() for p in content.split('\n\n') if p.strip()]
    chunks = []
    current_topic = topic or "general"
    for para in paragraphs:
        # Check if this is a topic header
        if para.startswith('#') or (len(para) < 50 and para.endswith(':')):
            current_topic = para.strip('#: ')
            continue
        chunks.append({
            "content": para,
            "metadata": {
                "type": "research",
                "topic": current_topic,
                "ingested_at": datetime.now().isoformat(),
                "word_count": len(para.split())
            }
        })
    return chunks

def chunk_with_source_attribution(
    content: str,
    source_url: str = None,
    source_title: str = None,
    researcher: str = None
) -> List[Dict]:
    """Chunk research with full source attribution."""
    chunks = chunk_research_notes(content)
    for chunk in chunks:
        chunk["metadata"].update({
            "source_url": source_url,
            "source_title": source_title,
            "researcher": researcher
        })
    return chunks
```
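For example, a finding captured from a web source would be chunked and attributed like this (the notes, URL, and names are illustrative):

```python
notes = """## Vector index maintenance

Re-embedding only changed chunks keeps reindexing costs low.

Stale chunks should be tombstoned rather than deleted immediately."""

chunks = chunk_with_source_attribution(
    notes,
    source_url="https://example.com/blog/index-maintenance",
    source_title="Index Maintenance Notes",
    researcher="jane",
)
for c in chunks:
    # The heading becomes the topic; each paragraph carries the attribution
    print(c["metadata"]["topic"], "-", c["metadata"]["source_title"])
```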
Metadata Schema:
type: research
topic: extracted or assigned topic
source_url: where the info came from
source_title: title of source
researcher: who did the research
ingested_at: timestamp
word_count: chunk size