# PureSpec Environmental — robots.txt
#
# Policy: open to every search engine and every AI surface that cites sources
# back to the originating site (citations drive future inspection bookings).
# Closed to aggressive scrapers, content harvesters, competitor SEO research
# tools, and AI training-only crawlers that provide no discovery value.
#
# Sitemap: https://www.purespecenvironmental.com/sitemap.xml
# Last updated: 2026-05-29

# ============================================================================
# Major search engines — full access, minus the site-wide utility exclusions
#
# A crawler obeys only the most specific group that names it and never merges
# in the "User-agent: *" rules (RFC 9309 section 2.2.1). The utility-URL
# exclusions from the default group are therefore repeated here; without them
# the crawlers that actually build search indexes could still fetch
# /thank-you/, /404.html, the duplicate /index.html and the backup OG image.
#
# All engines share one record (several User-agent lines, one rule set).
# Disallow lines precede "Allow: /" so that first-match parsers reach the
# same result as longest-match parsers.
# ============================================================================

User-agent: Googlebot
User-agent: Googlebot-Image
User-agent: Googlebot-Video
User-agent: Googlebot-News
User-agent: Storebot-Google
User-agent: GoogleOther
User-agent: Bingbot
User-agent: BingPreview
User-agent: DuckDuckBot
User-agent: Slurp
User-agent: Applebot
User-agent: YandexBot
User-agent: Baiduspider
Disallow: /index.html
Disallow: /404.html
Disallow: /thank-you/
Disallow: /assets/og-default-dark-backup.jpg
Allow: /

# ============================================================================
# Social link-preview bots — full access (rich previews when shared)
# ============================================================================

# One shared record: every link-preview fetcher listed below receives the
# same unrestricted rule, so shared URLs can render a rich preview card.
User-agent: Twitterbot
User-agent: facebookexternalhit
User-agent: LinkedInBot
User-agent: Pinterestbot
User-agent: WhatsApp
User-agent: Discordbot
User-agent: TelegramBot
User-agent: SkypeUriPreview
User-agent: Slackbot-LinkExpanding
User-agent: Iframely
Allow: /

# ============================================================================
# AI search & answer engines — allowed, minus the site-wide utility exclusions
# This is where future organic discovery lives. Someone asking ChatGPT or
# Claude "independent mold inspector Orlando" needs to find PureSpec here.
#
# What each token actually is:
#   OpenAI     GPTBot             model-training crawler
#              OAI-SearchBot      ChatGPT search index
#              ChatGPT-User       user-triggered fetches
#   Anthropic  ClaudeBot          model-training crawler
#              Claude-SearchBot   search index
#              Claude-User        user-triggered fetches
#              anthropic-ai, Claude-Web   older tokens, kept for compatibility
#   Google     Google-Extended    control token only (no crawler of its own):
#                                 governs use of content for Gemini training
#                                 and grounding. It does NOT affect Google
#                                 Search or AI Overviews, which follow
#                                 Googlebot.
#   Perplexity PerplexityBot      index crawler
#              Perplexity-User    user-triggered fetches
#   Apple      Applebot-Extended  control token only (does not crawl): governs
#                                 use of Applebot-crawled content for Apple
#                                 model training.
#   You.com    YouBot
#   Cohere     cohere-ai
#   Mistral    MistralAI-User
#   Meta       Meta-ExternalAgent   crawler
#              Meta-ExternalFetcher user-triggered fetcher
#
# NOTE(review): GPTBot, ClaudeBot, Google-Extended and Applebot-Extended are
# training-related rather than citing agents. They stay allowed here, as they
# were before; confirm that matches the "no training-only crawlers" policy
# stated at the top of this file.
#
# Named groups never inherit "User-agent: *" rules (RFC 9309 section 2.2.1),
# so the utility-URL exclusions are repeated here. Disallow lines precede
# "Allow: /" so first-match parsers agree with longest-match parsers.
# ============================================================================

User-agent: GPTBot
User-agent: OAI-SearchBot
User-agent: ChatGPT-User
User-agent: ClaudeBot
User-agent: Claude-SearchBot
User-agent: Claude-User
User-agent: anthropic-ai
User-agent: Claude-Web
User-agent: Google-Extended
User-agent: PerplexityBot
User-agent: Perplexity-User
User-agent: Applebot-Extended
User-agent: YouBot
User-agent: cohere-ai
User-agent: MistralAI-User
User-agent: Meta-ExternalAgent
User-agent: Meta-ExternalFetcher
Disallow: /index.html
Disallow: /404.html
Disallow: /thank-you/
Disallow: /assets/og-default-dark-backup.jpg
Allow: /

# ============================================================================
# BLOCKED — bulk training crawlers, aggressive scrapers, content harvesters
# ============================================================================

# Why each agent in the record below is shut out:
#   CCBot            Common Crawl — feeds bulk AI training datasets without
#                    per-fetch citation
#   Bytespider       ByteDance / TikTok — aggressive, opaque, high volume
#   Amazonbot        Alexa AI training, no discovery benefit for a local
#                    services site
#   Diffbot          commercial scraping-as-a-service
#   Omgilibot, omgili, Webzio-Extended
#                    Omgili / Webz.io — content aggregation for resale
#   PetalBot         Huawei — aggressive crawler with limited regional value
#   Yeti             Naver — Korean-market, low relevance for Central FL
#                    services
#   ImagesiftBot     image scraping without citation
#   Timpibot         search engine that doesn't return meaningful traffic
#   ScalenutBot      AI content tool that harvests for training
User-agent: CCBot
User-agent: Bytespider
User-agent: Amazonbot
User-agent: Diffbot
User-agent: Omgilibot
User-agent: omgili
User-agent: Webzio-Extended
User-agent: PetalBot
User-agent: Yeti
User-agent: ImagesiftBot
User-agent: Timpibot
User-agent: ScalenutBot
Disallow: /

# ============================================================================
# Competitor SEO research tools — deny free intelligence
# ============================================================================

# Single record covering every SEO / backlink research crawler: all of them
# are refused the entire site.
User-agent: AhrefsBot
User-agent: AhrefsSiteAudit
User-agent: SemrushBot
User-agent: SemrushBot-SA
User-agent: MJ12bot
User-agent: DotBot
User-agent: rogerbot
User-agent: BLEXBot
User-agent: MegaIndex
User-agent: MegaIndex.ru
User-agent: SerpstatBot
User-agent: DataForSeoBot
User-agent: SurdotlyBot
User-agent: BacklinkCrawler
User-agent: linkfluence
User-agent: SeekportBot
Disallow: /

# ============================================================================
# Site cloning / offline mirroring tools — block plagiarism and bandwidth abuse
# ============================================================================

# Single record covering the offline-mirroring / site-copy tools: all of them
# are refused the entire site.
User-agent: HTTrack
User-agent: WebCopier
User-agent: WebReaper
User-agent: WebStripper
User-agent: Offline Explorer
User-agent: SiteSnagger
User-agent: WebZIP
Disallow: /

# ============================================================================
# Default — any crawler without a named group above: open access, minus a few
# utility URLs. Crawlers that match a named group follow only that group and
# never read these rules (RFC 9309 section 2.2.1).
#
# Disallow lines come before "Allow: /" on purpose. Longest-match parsers
# are unaffected by order, but first-match parsers (for example Python's
# urllib.robotparser) stop at the first rule that matches; with "Allow: /"
# listed first they would treat every path as allowed and never reach the
# exclusions.
#
# Crawl-delay omitted intentionally: Googlebot ignores it; other major bots
# self-throttle; the site is CDN-served and can absorb normal crawl rates.
# ============================================================================

User-agent: *
Disallow: /index.html
Disallow: /404.html
Disallow: /thank-you/
Disallow: /assets/og-default-dark-backup.jpg
Allow: /

# ============================================================================
# Sitemap — absolute URL of the XML sitemap. This record is not part of any
# User-agent group; it is read by any crawler that supports the directive.
# ============================================================================

Sitemap: https://www.purespecenvironmental.com/sitemap.xml