| name | guardrails-safety |
| description | Protecting AI applications - input/output guards, toxicity detection, PII protection, injection defense, constitutional AI. Use when securing AI systems, preventing misuse, or ensuring compliance. |
Guardrails & Safety Skill
Protecting AI applications from misuse.
Input Guardrails
class InputGuard:
    """Screens incoming user text before it reaches the model.

    Runs three independent checks -- toxicity, PII, and prompt
    injection -- and reports which ones fired.
    """

    def __init__(self):
        # Collaborators are loaded once at construction time.
        self.toxicity = load_toxicity_model()
        self.pii = PIIDetector()
        self.injection = InjectionDetector()

    def check(self, text):
        """Inspect *text* and return a verdict dict.

        The returned dict carries:
          - "allowed": False when the toxicity or injection check fired
          - "issues": tags for every check that fired (PII included)
          - "sanitized": the input with any detected PII redacted
        """
        verdict = {"allowed": True, "issues": []}

        # Toxic input is blocked outright (threshold 0.7).
        if self.toxicity.predict(text) > 0.7:
            verdict["allowed"] = False
            verdict["issues"].append("toxic")

        # PII is redacted rather than blocked; record what was found.
        pii_hits = self.pii.detect(text)
        if pii_hits:
            verdict["issues"].append(f"pii: {pii_hits}")
            text = self.pii.redact(text)

        # Injection attempts block; note this runs on the redacted text.
        if self.injection.detect(text):
            verdict["allowed"] = False
            verdict["issues"].append("injection")

        verdict["sanitized"] = text
        return verdict
Output Guardrails
class OutputGuard:
    """Screens model output before it is returned to the user.

    Bug fix: the original class had no ``__init__`` yet ``check`` read
    ``self.fact_checker``, ``self.toxicity`` and ``self.citation_validator``,
    so every call raised AttributeError. The collaborators are now injected
    at construction time (all optional, so ``OutputGuard()`` still works for
    callers that assign them afterwards).
    """

    def __init__(self, toxicity=None, fact_checker=None, citation_validator=None):
        # Dependency injection keeps the guard testable; callers must
        # supply (or later assign) each collaborator they rely on.
        self.toxicity = toxicity
        self.fact_checker = fact_checker
        self.citation_validator = citation_validator

    def check(self, output, context=None):
        """Check *output* and return {"allowed": bool, "issues": [...]}.

        - Factuality is only assessable against a retrieval *context*;
          a hallucination is flagged but (deliberately) not blocked.
        - Toxicity above 0.5 blocks the output.
        - Invalid citations are flagged with their count, not blocked.
        """
        result = {"allowed": True, "issues": []}
        # Factuality: flag likely hallucination when grounding score is low.
        if context:
            if self.fact_checker.check(output, context) < 0.7:
                result["issues"].append("hallucination")
        # Toxicity: stricter threshold on the output side (0.5 vs 0.7 input).
        if self.toxicity.predict(output) > 0.5:
            result["allowed"] = False
            result["issues"].append("toxic")
        # Citations: report how many references failed validation.
        invalid = self.citation_validator.check(output)
        if invalid:
            result["issues"].append(f"bad_citations: {len(invalid)}")
        return result
Injection Detection
class InjectionDetector:
    """Flags common prompt-injection phrasings with regex heuristics.

    Improvement over the original: the raw patterns are combined and
    compiled once at class-definition time, and ``re.IGNORECASE``
    replaces the per-call ``text.lower()`` pass. Behavior on lowercase
    literal patterns is identical; per-call work drops to one search.
    """

    # Human-readable patterns, kept public for inspection/extension.
    # NOTE(review): "act as" / "disregard" are broad and may false-positive
    # on benign text -- tune against real traffic.
    PATTERNS = [
        r"ignore (previous|all) instructions",
        r"forget (your|all) (instructions|rules)",
        r"you are now",
        r"new persona",
        r"act as",
        r"pretend to be",
        r"disregard",
    ]

    # One alternation, compiled once; each pattern is wrapped in a
    # non-capturing group so alternation precedence stays correct.
    _COMBINED = re.compile("|".join(f"(?:{p})" for p in PATTERNS), re.IGNORECASE)

    def detect(self, text):
        """Return True when *text* matches any known injection pattern."""
        return self._COMBINED.search(text) is not None
Constitutional AI
class ConstitutionalFilter:
    """Critique-and-revise loop that enforces a list of principles.

    For each principle, a critic model judges the response; when the
    critique contains the word "violates", a reviser model rewrites the
    response, and the rewritten text is carried into the next principle.
    """

    def __init__(self, principles):
        self.principles = principles
        # Two separate models: one to judge, one to rewrite.
        self.critic = load_model("critic")
        self.reviser = load_model("reviser")

    def filter(self, response):
        """Return *response*, revised as needed to satisfy every principle."""
        for rule in self.principles:
            verdict = self.critic.generate(f"""
Does this violate: "{rule}"?
Response: {response}
""")
            # Keyword match on the critique text decides violation.
            if "violates" in verdict.lower():
                response = self.reviser.generate(f"""
Rewrite to comply with: "{rule}"
Original: {response}
Critique: {verdict}
""")
        return response
# Default principle set for ConstitutionalFilter. Principles are applied
# in order, and each revision feeds into the check for the next one.
PRINCIPLES = [
    "Do not provide harmful instructions",
    "Do not reveal personal information",
    "Acknowledge uncertainty",
    "Do not fabricate facts",
]
PII Protection
class PIIDetector:
    """Regex-based detector and redactor for common PII formats.

    Improvement over the original: each pattern is compiled once at
    class-definition time (``_COMPILED``) instead of being passed as a
    raw string to ``re.findall``/``re.sub`` on every call; the public
    ``PATTERNS`` dict is unchanged.
    """

    # Raw patterns, kept public for inspection/extension.
    # NOTE(review): these are US-centric formats (SSN, 10-digit phone);
    # extend for other locales as needed.
    PATTERNS = {
        "email": r"\b[\w.-]+@[\w.-]+\.\w+\b",
        "phone": r"\b\d{3}[-.]?\d{3}[-.]?\d{4}\b",
        "ssn": r"\b\d{3}-\d{2}-\d{4}\b",
        "credit_card": r"\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b",
    }

    # Compiled once; redaction below applies these in dict order.
    _COMPILED = {name: re.compile(pattern) for name, pattern in PATTERNS.items()}

    def detect(self, text):
        """Return {category: [matched strings]} for every PII type found."""
        found = {}
        for name, pattern in self._COMPILED.items():
            hits = pattern.findall(text)
            if hits:
                found[name] = hits
        return found

    def redact(self, text):
        """Replace each PII match with a [CATEGORY] placeholder."""
        for name, pattern in self._COMPILED.items():
            text = pattern.sub(f"[{name.upper()}]", text)
        return text
Best Practices
- Defense in depth (multiple layers)
- Log all blocked content
- Regular adversarial testing
- Update patterns continuously
- Fail closed (block if uncertain)