# Karafka Documentation - Robots.txt
# Updated: 2025-02-12
#
# Policy: Allow user-facing AI assistants (RAG/retrieval for users)
#         Block training data collection crawlers

# ============================================
# USER-FACING AI ASSISTANTS (ALLOWED)
# ============================================
# These bots serve users in real-time with our docs
# Examples: Claude Code, ChatGPT browsing, Perplexity search

User-agent: ChatGPT-User
User-agent: Claude-Web
User-agent: anthropic-ai
User-agent: PerplexityBot
User-agent: YouBot
User-agent: Applebot-Extended
Allow: /
Allow: /docs/
Allow: /llms.txt

# ============================================
# TRAINING DATA CRAWLERS (BLOCKED)
# ============================================
# These bots collect data for AI model training

# OpenAI Training Crawler
User-agent: GPTBot
Disallow: /

# Common Crawl (used by many AI companies for training)
User-agent: CCBot
Disallow: /

# Google Bard/Gemini Training
User-agent: Google-Extended
Disallow: /

# Anthropic Training Crawler
User-agent: anthropic-training
Disallow: /

# Meta/Facebook AI Training
User-agent: FacebookBot
User-agent: Meta-ExternalAgent
Disallow: /

# Amazon Training
User-agent: Amazonbot
Disallow: /

# ByteDance/TikTok AI Training
User-agent: Bytespider
Disallow: /

# Diffbot Training
User-agent: Diffbot
Disallow: /

# Cohere Training
User-agent: cohere-ai
Disallow: /

# AI2 (Allen Institute) Training
User-agent: AI2Bot
Disallow: /

# Omgili Training
User-agent: omgili
Disallow: /

# Scrapy-based crawlers
User-agent: Scrapy
Disallow: /

# ============================================
# REGULAR SEARCH ENGINES (ALLOWED)
# ============================================

User-agent: Googlebot
Allow: /

User-agent: Bingbot
Allow: /

User-agent: Slurp
Allow: /

User-agent: DuckDuckBot
Allow: /

# ============================================
# DEFAULT (CAUTIOUS)
# ============================================

User-agent: *
Allow: /
Disallow: /admin/
Disallow: /releases/

# Sitemap
Sitemap: https://karafka.io/docs/sitemap.xml