# Default policy for every crawler that does not have a dedicated
# User-agent group further down in this file.
#
# Disallow rules are listed BEFORE Allow rules on purpose. Crawlers that
# follow RFC 9309 pick the longest matching path, so rule order does not
# matter to them. Parsers that apply the FIRST matching rule instead
# (e.g. Python's urllib.robotparser) would hit a leading "Allow: /" and
# treat every private path below as open.
#
# The group contains no blank lines: some legacy parsers treat a blank
# line as the end of a group and would ignore any rules that follow it.
User-agent: *
Disallow: /admin
Disallow: /sysadmin
Disallow: /moderate
Disallow: /account
Disallow: /scan
Disallow: /journal
Disallow: /user-entry/
Disallow: /setup
Disallow: /auth
Disallow: /signup
Disallow: /capture/
Disallow: /api/
# Public surface. The specific paths are already covered by "Allow: /".
# NOTE(review): presumably kept as explicit documentation of the public
# sections -- confirm before removing.
Allow: /strain/
Allow: /brand/
Allow: /dispensary/
Allow: /grower/
Allow: /terpenes
Allow: /terpene/
Allow: /references
Allow: /references/
Allow: /wiki/
Allow: /hb568/
Allow: /

# ──────────────────────────────────────────────────────────────────────
# AI bot policy
# ──────────────────────────────────────────────────────────────────────
#
# We split AI crawlers into two buckets and treat them differently.
#
# 1. SEARCH-TIME FETCHERS — allowed.
#    These bots fetch pages on-demand when a user asks an AI engine a
#    question; the resulting AI answer includes attribution links back
#    to the source. Allowing them means TerpTrace's wiki + strain pages
#    can show up as cited sources in ChatGPT Search, Perplexity, and
#    Google AI Overviews — which is the primary discovery surface for
#    long-tail patient questions ("what does total THC mean", "best
#    strains for sleep louisiana", etc.). Same rules as Googlebot;
#    these bots see only the public surface the * block allows.
#
# 2. TRAINING-CORPUS CRAWLERS — disallowed.
#    These bots bulk-scrape content into training data for future model
#    versions. They produce no attribution, no traffic, no transparency
#    around how the content is used or retained. Patient-submitted
#    images and lab data on TerpTrace are copyrighted by their
#    contributors; we do not consent to ingestion into model training
#    corpora, embeddings, or commercial scrape products. See /terms.
#
# Bot families are split per the public consensus on opt-out user-
# agents documented by OpenAI, Anthropic, Google, Apple, Perplexity,
# Common Crawl, ByteDance, Cohere, Diffbot, Omgili, and Meta.

# --- search-time fetchers (ALLOWED on the same public surface as Googlebot) ---
#
# Per RFC 9309, most crawlers apply only the MOST SPECIFIC matching
# user-agent group and IGNORE the wildcard (*) group. So if we list a
# search-time fetcher in its own group, that group must repeat every
# Disallow we want to enforce -- otherwise the bot inherits a wide-
# open policy and crawls /admin, /account, /journal, /api/, etc.
#
# All three search-time fetchers share the same policy, so we group them
# under a single multi-line User-agent header (spec-supported, RFC 9309
# section 2.2.1) and apply the same Allow/Disallow rules the wildcard
# group enforces for the public * crawler.
#
# NOTE: Google-Extended is intentionally NOT listed here. Despite its
# name, Google-Extended is not a separate crawler -- it's Google's
# opt-out control token governing whether content already fetched by
# Googlebot can be re-used for Gemini training and Vertex AI grounding.
# Real-time Google AI Overviews in Search are powered by the regular
# Googlebot index (already allowed via the * group), so listing
# Google-Extended as "allowed" would silently opt the site into AI
# training/grounding without buying any additional discovery surface.
# It lives in the training-corpus DISALLOW group below.

# Rule order: Disallow rules come first so that parsers applying the
# FIRST matching rule (rather than RFC 9309 longest-match) still keep
# the private paths closed. Longest-match crawlers are unaffected by
# the ordering. No blank lines inside the group, so legacy parsers that
# end a group at a blank line still see every rule.
User-agent: ChatGPT-User
User-agent: OAI-SearchBot
User-agent: PerplexityBot
Disallow: /admin
Disallow: /sysadmin
Disallow: /moderate
Disallow: /account
Disallow: /scan
Disallow: /journal
Disallow: /user-entry/
Disallow: /setup
Disallow: /auth
Disallow: /signup
Disallow: /capture/
Disallow: /api/
Allow: /strain/
Allow: /brand/
Allow: /dispensary/
Allow: /grower/
Allow: /terpenes
Allow: /terpene/
Allow: /references
Allow: /references/
Allow: /wiki/
Allow: /hb568/
Allow: /

# --- training-corpus crawlers (DISALLOWED) ---

# Every crawler below gets its own one-rule group that blocks the whole
# site. Groups are independent of each other, so their order carries no
# meaning; they are arranged by vendor for readability.

# Anthropic
User-agent: anthropic-ai
Disallow: /

User-agent: ClaudeBot
Disallow: /

User-agent: Claude-Web
Disallow: /

# Apple
User-agent: Applebot-Extended
Disallow: /

# ByteDance
User-agent: Bytespider
Disallow: /

# Cohere
User-agent: cohere-ai
Disallow: /

# Common Crawl
User-agent: CCBot
Disallow: /

# Diffbot
User-agent: Diffbot
Disallow: /

# Google
# Google-Extended is an opt-out token covering Gemini training and
# Vertex AI grounding. Blocking it keeps TerpTrace content out of
# Google's AI training pipeline; Googlebot's regular search indexing
# (permitted via the * group, and the source for Search AI Overviews)
# is not affected.
User-agent: Google-Extended
Disallow: /

# Meta
User-agent: FacebookBot
Disallow: /

User-agent: Meta-ExternalAgent
Disallow: /

# Omgili
User-agent: Omgilibot
Disallow: /

# OpenAI
User-agent: GPTBot
Disallow: /

# Absolute URL of the XML sitemap. This directive stands outside the
# User-agent groups above and is not tied to any one of them.
Sitemap: https://www.terptrace.com/sitemap.xml