| name | document-inventory |
| description | Scan and catalog document collections with metadata extraction, categorization, and statistics. Use for auditing document libraries, preparing for knowledge base creation, or understanding large file collections. |
Document Inventory Skill
Overview
This skill scans document collections (PDFs, Word docs, text files) and creates a structured inventory with metadata, automatic categorization, and collection statistics. Essential first step before building knowledge bases.
When to Use
- Auditing large document libraries before processing
- Understanding the scope of a document collection
- Categorizing documents by type, source, or content
- Preparing inventories for knowledge base creation
- Generating reports on document collections
- Identifying duplicates or organizing files
Features
- Recursive scanning - Process nested directories
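- Metadata extraction - Size, created/modified dates, parent directory, and directory depth (page counts are not extracted by the implementation below)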
- Auto-categorization - Pattern-based classification
- Statistics generation - Collection summaries
- SQLite storage - Queryable inventory database
- Multiple formats - PDF, DOCX, TXT, and more
Implementation
Core Inventory Builder
#!/usr/bin/env python3
"""Document inventory builder."""
import sqlite3
import os
from pathlib import Path
from datetime import datetime
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class DocumentInventory:
    """Build and query a SQLite-backed inventory of document files.

    Every supported file found by :meth:`scan_directory` is stored as one
    row of the ``documents`` table, keyed by its path.  The instance owns a
    single SQLite connection (``self.conn``); release it with :meth:`close`
    or by using the instance as a context manager::

        with DocumentInventory('inventory.db') as inventory:
            inventory.scan_directory('/path/to/documents')
    """

    # Lower-case file suffix -> label stored in the ``file_type`` column.
    SUPPORTED_EXTENSIONS = {
        '.pdf': 'PDF',
        '.docx': 'Word',
        '.doc': 'Word',
        '.txt': 'Text',
        '.md': 'Markdown',
        '.xlsx': 'Excel',
        '.xls': 'Excel',
        '.pptx': 'PowerPoint',
        '.ppt': 'PowerPoint',
    }

    # Standards-body acronyms, checked in this order (first match wins).
    # The acronym itself is used as the category name.
    STANDARD_ACRONYMS = (
        'API',
        'ISO',
        'ASME',
        'DNV',
        'NORSOK',
        'BSI',
        'ASTM',
        'AWS',
        'ABS',
        'AISC',
        'IEEE',
    )

    # Upper-case path fragment -> category, checked in this order when no
    # acronym matched.  These are substring matches on purpose so that
    # 'STANDARD' also covers 'STANDARDS', 'SPEC' covers 'SPECIFICATION', etc.
    PATH_CATEGORIES = {
        'STANDARD': 'Standards',
        'SPEC': 'Specifications',
        'MANUAL': 'Manuals',
        'GUIDE': 'Guides',
        'REPORT': 'Reports',
        'DRAWING': 'Drawings',
        'PROCEDURE': 'Procedures',
    }

    def __init__(self, db_path):
        """Open (or create) the inventory database at ``db_path``.

        Args:
            db_path: Path handed to ``sqlite3.connect``; ``':memory:'``
                gives a throw-away in-memory inventory.
        """
        self.db_path = db_path
        # timeout=30: wait up to 30 s for a lock held by another connection.
        self.conn = sqlite3.connect(db_path, timeout=30)
        self._setup_tables()

    def __enter__(self):
        """Return the inventory itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the connection on leaving the ``with`` block."""
        self.close()

    def close(self):
        """Close the underlying SQLite connection."""
        self.conn.close()

    def _setup_tables(self):
        """Create the ``documents`` table and its indexes if they are missing."""
        cursor = self.conn.cursor()
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS documents (
                id INTEGER PRIMARY KEY,
                filename TEXT NOT NULL,
                filepath TEXT UNIQUE NOT NULL,
                extension TEXT,
                file_type TEXT,
                category TEXT,
                file_size INTEGER,
                created_date TEXT,
                modified_date TEXT,
                parent_dir TEXT,
                depth INTEGER,
                scanned_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')
        cursor.execute('''
            CREATE INDEX IF NOT EXISTS idx_category ON documents(category)
        ''')
        cursor.execute('''
            CREATE INDEX IF NOT EXISTS idx_extension ON documents(extension)
        ''')
        self.conn.commit()

    def scan_directory(self, root_path):
        """Recursively scan ``root_path`` and record every supported file.

        Args:
            root_path: Directory to scan (str or path-like).

        Returns:
            The number of supported documents found.  Files that were found
            but could not be recorded are still counted; they are reported
            in a separate warning.  A path that is not a directory yields 0.
        """
        root = Path(root_path).resolve()
        if not root.is_dir():
            # rglob() on a missing path silently yields nothing; say so.
            logger.warning("Not a directory, nothing to scan: %s", root)
            return 0

        logger.info("Scanning: %s", root)
        count = 0
        failed = 0
        for filepath in root.rglob('*'):
            # Cheap suffix test first; only stat entries that can qualify.
            if filepath.suffix.lower() not in self.SUPPORTED_EXTENSIONS:
                continue
            if not filepath.is_file():
                continue
            if not self._add_document(filepath, root):
                failed += 1
            count += 1
            if count % 500 == 0:
                # Commit in batches so a long scan does not hold one huge
                # transaction.
                logger.info("Scanned %s documents...", count)
                self.conn.commit()
        self.conn.commit()

        if failed:
            logger.warning("%s documents could not be added", failed)
        logger.info("Scan complete: %s documents found", count)
        return count

    def _add_document(self, filepath, root):
        """Insert or replace the inventory row for one file.

        Args:
            filepath: ``Path`` of the file to record.
            root: Scan root; ``depth`` is the number of directories between
                ``root`` and the file.

        Returns:
            True if the row was written, False if the file was skipped
            because of an error (which is logged, not raised).
        """
        try:
            stat = filepath.stat()
            ext = filepath.suffix.lower()
            # st_ctime is the metadata-change time on Unix, not the creation
            # time; prefer st_birthtime on platforms that expose it.
            created_ts = getattr(stat, 'st_birthtime', stat.st_ctime)
            self.conn.execute('''
                INSERT OR REPLACE INTO documents
                (filename, filepath, extension, file_type, category,
                 file_size, created_date, modified_date, parent_dir, depth)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ''', (
                filepath.name,
                str(filepath),
                ext,
                self.SUPPORTED_EXTENSIONS.get(ext, 'Unknown'),
                self._categorize(filepath),
                stat.st_size,
                datetime.fromtimestamp(created_ts).isoformat(),
                datetime.fromtimestamp(stat.st_mtime).isoformat(),
                str(filepath.parent),
                len(filepath.relative_to(root).parts) - 1,
            ))
        except Exception as e:
            # Best effort by design: one unreadable file must not abort the
            # whole scan.
            logger.warning("Error adding %s: %s", filepath, e)
            return False
        return True

    def _categorize(self, filepath):
        """Derive a category from the file's full path.

        Acronyms in ``STANDARD_ACRONYMS`` are matched first and must appear
        as a complete run of letters, so ``API_650.pdf`` and ``API650.pdf``
        are 'API' while ``capital.pdf`` or a ``website`` directory are not
        mistaken for 'API' / 'BSI'.  If no acronym matches, the fragments in
        ``PATH_CATEGORIES`` are matched as plain substrings.

        Returns:
            The category name, or 'Unknown' when nothing matches.
        """
        path_str = str(filepath).upper()

        # Split the path into maximal runs of letters ("words").
        words = set(
            ''.join(ch if ch.isalpha() else ' ' for ch in path_str).split()
        )
        for acronym in self.STANDARD_ACRONYMS:
            if acronym in words:
                return acronym

        for fragment, category in self.PATH_CATEGORIES.items():
            if fragment in path_str:
                return category
        return 'Unknown'

    def get_statistics(self):
        """Summarise the inventory.

        Returns:
            A dict with the keys ``total_documents``, ``total_size_mb``,
            ``by_type`` (file type -> ``{'count', 'size_mb'}``),
            ``by_category`` and ``by_extension`` (value -> count).  The
            grouped dicts are ordered by descending count.
        """
        cursor = self.conn.cursor()
        stats = {}

        cursor.execute('SELECT COUNT(*) FROM documents')
        stats['total_documents'] = cursor.fetchone()[0]

        cursor.execute('SELECT SUM(file_size) FROM documents')
        # SUM() is NULL on an empty table.
        total_bytes = cursor.fetchone()[0] or 0
        stats['total_size_mb'] = round(total_bytes / (1024 * 1024), 2)

        cursor.execute('''
            SELECT file_type, COUNT(*), SUM(file_size)
            FROM documents
            GROUP BY file_type
            ORDER BY COUNT(*) DESC
        ''')
        stats['by_type'] = {
            file_type: {
                'count': n,
                'size_mb': round((size or 0) / 1024 / 1024, 2),
            }
            for file_type, n, size in cursor.fetchall()
        }

        cursor.execute('''
            SELECT category, COUNT(*)
            FROM documents
            GROUP BY category
            ORDER BY COUNT(*) DESC
        ''')
        stats['by_category'] = dict(cursor.fetchall())

        cursor.execute('''
            SELECT extension, COUNT(*)
            FROM documents
            GROUP BY extension
            ORDER BY COUNT(*) DESC
        ''')
        stats['by_extension'] = dict(cursor.fetchall())

        return stats

    def search(self, query, category=None, file_type=None, limit=50):
        """Search the inventory by filename substring and optional filters.

        Args:
            query: Text that must occur in the filename.  It is matched
                literally: ``%`` and ``_`` are not treated as wildcards.
                A falsy value disables the filename filter.
            category: If given, only rows with exactly this category.
            file_type: If given, only rows with exactly this file type.
            limit: Maximum number of rows returned.

        Returns:
            A list of ``(filename, filepath, category, file_size)`` tuples
            ordered by filename.
        """
        sql = 'SELECT filename, filepath, category, file_size FROM documents WHERE 1=1'
        params = []

        if query:
            # Escape LIKE metacharacters so e.g. "API_650" does not also
            # match "APIX650".
            escaped = (
                query.replace('\\', '\\\\')
                .replace('%', '\\%')
                .replace('_', '\\_')
            )
            sql += " AND filename LIKE ? ESCAPE '\\'"
            params.append(f'%{escaped}%')
        if category:
            sql += ' AND category = ?'
            params.append(category)
        if file_type:
            sql += ' AND file_type = ?'
            params.append(file_type)

        sql += ' ORDER BY filename LIMIT ?'
        params.append(limit)

        return self.conn.execute(sql, params).fetchall()

    def export_csv(self, output_path):
        """Write every column of every inventory row to a UTF-8 CSV file.

        Args:
            output_path: Destination file; it is overwritten if it exists.
        """
        import csv

        cursor = self.conn.cursor()
        cursor.execute('SELECT * FROM documents')
        columns = [desc[0] for desc in cursor.description]

        with open(output_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(columns)
            # Stream rows from the cursor instead of loading them all.
            writer.writerows(cursor)

        logger.info("Exported to %s", output_path)
CLI Interface
#!/usr/bin/env python3
"""Document Inventory CLI."""
import argparse
import json
def main():
    """Parse the command line and run one inventory sub-command.

    Sub-commands: ``scan`` (scan a directory and print a summary),
    ``stats`` (print statistics as text or JSON), ``search`` (search by
    filename with optional category/type filters) and ``export`` (write
    the inventory to CSV).  Without a sub-command the help text is printed.
    """
    # Option shared by every sub-command, defined once.
    db_options = argparse.ArgumentParser(add_help=False)
    db_options.add_argument('--db', default='inventory.db', help='Database path')

    parser = argparse.ArgumentParser(description='Document Inventory Tool')
    subparsers = parser.add_subparsers(dest='command', help='Commands')

    # Scan command
    scan_parser = subparsers.add_parser(
        'scan', parents=[db_options], help='Scan directory')
    scan_parser.add_argument('path', help='Directory to scan')

    # Stats command
    stats_parser = subparsers.add_parser(
        'stats', parents=[db_options], help='Show statistics')
    stats_parser.add_argument('--json', action='store_true', help='Output as JSON')

    # Search command
    search_parser = subparsers.add_parser(
        'search', parents=[db_options], help='Search inventory')
    search_parser.add_argument('query', help='Search query')
    search_parser.add_argument('--category', help='Filter by category')
    search_parser.add_argument('--type', help='Filter by file type')

    # Export command
    export_parser = subparsers.add_parser(
        'export', parents=[db_options], help='Export to CSV')
    export_parser.add_argument('output', help='Output CSV path')

    args = parser.parse_args()

    if args.command is None:
        parser.print_help()
        return

    inventory = DocumentInventory(args.db)
    try:
        if args.command == 'scan':
            count = inventory.scan_directory(args.path)
            print(f"\nScanned {count} documents")

            stats = inventory.get_statistics()
            print(f"Total size: {stats['total_size_mb']} MB")
            print(f"\nBy category:")
            # Distinct loop name so the scan count above is not overwritten.
            for cat, n in list(stats['by_category'].items())[:10]:
                print(f" {cat}: {n}")

        elif args.command == 'stats':
            stats = inventory.get_statistics()
            if args.json:
                print(json.dumps(stats, indent=2))
            else:
                print(f"Total Documents: {stats['total_documents']}")
                print(f"Total Size: {stats['total_size_mb']} MB")
                print(f"\nBy Type:")
                for t, data in stats['by_type'].items():
                    print(f" {t}: {data['count']} ({data['size_mb']} MB)")
                print(f"\nBy Category:")
                for cat, n in list(stats['by_category'].items())[:15]:
                    print(f" {cat}: {n}")

        elif args.command == 'search':
            results = inventory.search(
                args.query,
                category=args.category,
                file_type=args.type,
            )
            print(f"Found {len(results)} results:\n")
            for filename, filepath, category, size in results:
                # Both columns are nullable in the schema; a NULL must not
                # crash the formatting.
                size_kb = (size or 0) / 1024
                print(f" [{str(category):10}] {filename} ({size_kb:.1f} KB)")

        elif args.command == 'export':
            inventory.export_csv(args.output)
    finally:
        # Release the SQLite connection even if a command fails.
        inventory.conn.close()


if __name__ == '__main__':
    main()
Report Generator
def generate_report(db_path, output_path):
    """Generate HTML inventory report.

    Reads the statistics of the inventory stored at ``db_path`` and writes
    a self-contained, UTF-8 encoded HTML page to ``output_path`` showing
    the total document count, the total size, and per-type / per-category
    tables.

    Args:
        db_path: Path of the inventory SQLite database.
        output_path: Destination HTML file; overwritten if it exists.
    """
    # Local import: this snippet has no import block of its own.
    from html import escape

    inventory = DocumentInventory(db_path)
    try:
        stats = inventory.get_statistics()
    finally:
        # The statistics are plain Python data; the connection is no longer
        # needed once they are read.
        inventory.conn.close()

    # Values read back from the database are escaped before being placed
    # into markup.
    type_rows = ''.join(
        f"<tr><td>{escape(str(file_type))}</td>"
        f"<td>{data['count']}</td><td>{data['size_mb']}</td></tr>"
        for file_type, data in stats['by_type'].items()
    )
    category_rows = ''.join(
        f"<tr><td>{escape(str(category))}</td><td>{n}</td></tr>"
        for category, n in stats['by_category'].items()
    )

    report = f"""
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Document Inventory Report</title>
<style>
body {{ font-family: Arial, sans-serif; margin: 40px; }}
h1 {{ color: #333; }}
.stat-box {{ background: #f5f5f5; padding: 20px; margin: 10px 0; border-radius: 8px; }}
.stat-value {{ font-size: 2em; font-weight: bold; color: #2196F3; }}
table {{ border-collapse: collapse; width: 100%; margin: 20px 0; }}
th, td {{ border: 1px solid #ddd; padding: 12px; text-align: left; }}
th {{ background: #2196F3; color: white; }}
tr:nth-child(even) {{ background: #f9f9f9; }}
</style>
</head>
<body>
<h1>Document Inventory Report</h1>
<p>Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
<div class="stat-box">
<div class="stat-value">{stats['total_documents']:,}</div>
<div>Total Documents</div>
</div>
<div class="stat-box">
<div class="stat-value">{stats['total_size_mb']:,.1f} MB</div>
<div>Total Size</div>
</div>
<h2>By File Type</h2>
<table>
<tr><th>Type</th><th>Count</th><th>Size (MB)</th></tr>
{type_rows}
</table>
<h2>By Category</h2>
<table>
<tr><th>Category</th><th>Count</th></tr>
{category_rows}
</table>
</body>
</html>
"""
    # Explicit encoding so the file matches the declared charset regardless
    # of the platform's default.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(report)
    print(f"Report generated: {output_path}")
Custom Categorization
Extend with Your Patterns
# Add custom patterns for your domain.
# Upper-case filename fragment -> category; checked in this order, first
# match wins.
CUSTOM_PATTERNS = {
    'SPEC': 'Specifications',
    'DWG': 'Drawings',
    'REV': 'Revisions',
    'APPROVED': 'Approved',
    'DRAFT': 'Draft',
    'SUPERSEDED': 'Superseded',
}


def categorize_custom(filepath, patterns=None):
    """Categorize a file by substrings of its (case-insensitive) filename.

    Only the final path component is inspected; directory names are ignored.

    Args:
        filepath: A ``pathlib`` path (or any object with a ``name``
            attribute), or a plain string path.
        patterns: Optional mapping of upper-case fragment -> category to
            use instead of ``CUSTOM_PATTERNS``.

    Returns:
        The category of the first matching fragment, or 'Uncategorized'.
    """
    from pathlib import PurePath

    if patterns is None:
        patterns = CUSTOM_PATTERNS

    # Path-like objects already expose .name; plain strings are parsed.
    if hasattr(filepath, 'name'):
        name = filepath.name
    else:
        name = PurePath(filepath).name
    name = name.upper()

    for pattern, category in patterns.items():
        if pattern in name:
            return category
    return 'Uncategorized'
Multi-Level Categories
def categorize_hierarchical(filepath):
    """Create a two-level ``"primary/secondary"`` category from a filename.

    The primary level is 'API Standards' or 'ISO Standards' when the
    filename contains ``API`` / ``ISO`` as a complete run of letters
    (``API_650.pdf`` and ``ISO9001.pdf`` match, ``capital.pdf`` does not),
    otherwise 'General'.  The secondary level is 'Design', 'Safety' or
    'Quality' when the filename contains that word, otherwise 'Other'.
    Matching is case-insensitive and looks at the final path component only.

    Args:
        filepath: A ``pathlib`` path (or any object with a ``name``
            attribute), or a plain string path.

    Returns:
        The combined category, e.g. ``'API Standards/Design'``.
    """
    from pathlib import PurePath

    # Path-like objects already expose .name; plain strings are parsed.
    if hasattr(filepath, 'name'):
        name = filepath.name
    else:
        name = PurePath(filepath).name
    name = name.upper()

    # Maximal runs of letters, so short acronyms are not found inside
    # longer words.
    words = set(''.join(ch if ch.isalpha() else ' ' for ch in name).split())

    # Primary category
    if 'API' in words:
        primary = 'API Standards'
    elif 'ISO' in words:
        primary = 'ISO Standards'
    else:
        primary = 'General'

    # Secondary category
    if 'DESIGN' in name:
        secondary = 'Design'
    elif 'SAFETY' in name:
        secondary = 'Safety'
    elif 'QUALITY' in name:
        secondary = 'Quality'
    else:
        secondary = 'Other'

    return f"{primary}/{secondary}"
Best Practices
- Scan before processing - Always inventory first
- Use SQLite timeout - `timeout=30` for concurrent access
- Batch commits - Commit every 500 files
- Handle errors gracefully - Log and continue on failures
- Export for review - Generate CSV/HTML for stakeholders
- Update incrementally - Use `INSERT OR REPLACE` so that re-scanning replaces the rows of files already in the inventory
Example Usage
# Scan directory
python inventory.py scan /path/to/documents --db inventory.db
# View statistics
python inventory.py stats --db inventory.db
# Search filenames containing "650" within the "API" category
# (a file whose name contains "API" is always categorized as "API",
# so combining the query "API" with another category returns nothing)
python inventory.py search "650" --category "API" --db inventory.db
# Export to CSV
python inventory.py export inventory.csv --db inventory.db
Related Skills
- `knowledge-base-builder` - Build searchable database after inventory
- `pdf-text-extractor` - Extract text from inventoried PDFs
- `semantic-search-setup` - Add AI search capabilities
Version History
- 1.0.0 (2024-10-15): Initial release with SQLite storage, auto-categorization, CLI interface