# Jay Mount Consulting — robots.txt
# Content licensing: https://jaymountconsulting.com/ai-policy
# Rights reserved under Article 4(3) of Directive (EU) 2019/790 (TDM opt-out).
#
# Rules below opt OUT of AI model training while opting IN to search,
# answer-engine citation, and link previews. Specific UA blocks override
# the catch-all at the bottom.

# === ALLOW: search engines (organic search) ===

# Organic search crawlers share one rule set: the public site is crawlable,
# the private application paths are not. Consecutive User-Agent lines form a
# single group, so every crawler named here receives the rules that follow.
# Keep the group free of blank lines; some parsers end a group on one.
# Google
User-Agent: Googlebot
User-Agent: Googlebot-News
User-Agent: Googlebot-Image
User-Agent: Googlebot-Video
# Microsoft
User-Agent: Bingbot
User-Agent: msnbot
User-Agent: msnbot-media
# Other search engines
User-Agent: DuckDuckBot
User-Agent: Slurp
User-Agent: Mojeek
User-Agent: Qwantify
Disallow: /admin/
Disallow: /api/
Disallow: /client/
Disallow: /ops/
Disallow: /portal/

# === ALLOW: AI answer engines (citation-based, drives traffic to us) ===

# Google-Extended is a control token, not a separate crawler. It governs
# whether content Google has crawled may be used for Gemini model training
# and for grounding in Gemini Apps / Vertex AI. It has no effect on Google
# Search or AI Overviews, which follow the Googlebot rules. It is therefore
# fully disallowed here, even though it sits in the ALLOW section, so that
# the file matches its stated "no AI model training" policy.
# NOTE(review): this also opts out of Gemini Apps grounding. Revert to the
# path-level rules only if that citation exposure is judged worth
# permitting Gemini training.
User-Agent: Google-Extended
Disallow: /

# Citation-oriented AI fetchers share one rule set: public pages may be
# retrieved, private application paths may not. All User-Agent lines below
# belong to a single group; keep it free of blank lines.
# OpenAI — search/citation paths (not training)
User-Agent: OAI-SearchBot
User-Agent: ChatGPT-User
# Anthropic Claude — search/citation paths (not training)
User-Agent: claude-web
User-Agent: Claude-SearchBot
User-Agent: Claude-User
# Perplexity — RAG with on-page citations
User-Agent: PerplexityBot
User-Agent: Perplexity-User
# Apple search/Siri (not Apple Intelligence training)
User-Agent: Applebot
# You.com answer engine
User-Agent: YouBot
Disallow: /admin/
Disallow: /api/
Disallow: /client/
Disallow: /ops/
Disallow: /portal/

# === ALLOW: link previews + social ===

# Link-unfurling fetchers for chat and social platforms. One group: public
# pages may be fetched for previews, private application paths may not.
User-Agent: Discordbot
User-Agent: FacebookExternalHit
User-Agent: LinkedInBot
User-Agent: Slackbot-LinkExpanding
User-Agent: TelegramBot
User-Agent: Twitterbot
User-Agent: WhatsApp
Disallow: /admin/
Disallow: /api/
Disallow: /client/
Disallow: /ops/
Disallow: /portal/

# === ALLOW: archival crawlers (neutral) ===

# Web-archive crawlers: public pages may be captured, private application
# paths may not.
User-Agent: archive.org_bot
User-Agent: ia_archiver
Disallow: /admin/
Disallow: /api/
Disallow: /client/
Disallow: /ops/
Disallow: /portal/

# === BLOCK: AI model training crawlers ===

# Every crawler named in this group is denied the entire site. All
# User-Agent lines below form one group that ends at the single Disallow
# rule; keep the group free of blank lines so no token is dropped from it.
# OpenAI training
User-Agent: GPTBot
# Anthropic training (legacy + current)
User-Agent: ClaudeBot
User-Agent: anthropic-ai
# Google generic crawlers (GoogleOther family)
User-Agent: GoogleOther
User-Agent: GoogleOther-Image
User-Agent: GoogleOther-Video
# Apple Intelligence training (opt-out via Applebot-Extended)
User-Agent: Applebot-Extended
# Meta AI / Llama training
User-Agent: meta-externalagent
User-Agent: FacebookBot
# Common Crawl (feeds LAION + many AI training datasets)
User-Agent: CCBot
# ByteDance / TikTok (aggressive scraper, training)
User-Agent: Bytespider
# Huawei (Petal AI training)
User-Agent: PetalBot
# Image AI training
User-Agent: ImagesiftBot
User-Agent: img2dataset
# Cohere training
User-Agent: cohere-ai
User-Agent: cohere-training-data-crawler
# Diffbot (commercial scraper, frequently feeds AI training)
User-Agent: Diffbot
# Omgili (feeds many AI training datasets)
User-Agent: Omgilibot
User-Agent: Omgili
# Miscellaneous scrapers and data resellers, blocked as a precaution
User-Agent: FriendlyCrawler
User-Agent: peer39_crawler
User-Agent: peer39_crawler/1.0
User-Agent: AwarioBot
User-Agent: AwarioRssBot
User-Agent: AwarioSmartBot
User-Agent: DataForSeoBot
User-Agent: Timpibot
User-Agent: VelenPublicWebCrawler
User-Agent: webzio-extended
User-Agent: ICC-Crawler
User-Agent: ISSCyberRiskCrawler
User-Agent: Kangaroo Bot
User-Agent: SemanticScholarBot
Disallow: /

# === DEFAULT: unknown / unlisted crawlers ===
# Conservative: allow the public site, block the private paths (portal,
# client, ops, admin, api). New AI training crawlers that are not matched
# by a specific group above fall through to this group; add them
# explicitly when identified.

# Catch-all group: applies to any crawler not matched by a named group in
# this file. Public pages are crawlable; private application paths are not.
User-Agent: *
Disallow: /admin/
Disallow: /api/
Disallow: /client/
Disallow: /ops/
Disallow: /portal/

Sitemap: https://jaymountconsulting.com/sitemap.xml

# === CONTENT SIGNALS (Cloudflare draft standard) ===
# search=yes      — content may be indexed for organic search
# ai-input=yes    — content may be retrieved as input for AI answer engines
#                   provided answers cite the original source
# ai-train=no     — content is NOT licensed for AI model training
# These tokens supplement the per-UA blocks above and assert our TDM
# (Text and Data Mining) reservation under EU Copyright Directive
# Article 4 + comparable jurisdictions. Placed AFTER Sitemap so any
# strict parser that aborts on this unknown directive (e.g. Bing Webmaster
# Tools tester flags it as an error) has already consumed the Sitemap.
# NOTE(review): the Content Signals proposal shows this line inside a
# User-Agent group. Whether a consumer associates this trailing line with
# the preceding `User-Agent: *` group is parser-dependent — confirm with
# the consumers that matter before relying on it.
Content-Signal: search=yes, ai-input=yes, ai-train=no