Skip to content

Commit 546b2c3

Browse files
committed
feat: block AI crawlers via robots.txt
1 parent 75d869f commit 546b2c3

2 files changed

Lines changed: 190 additions & 4 deletions

File tree

_src/robots.njk

Lines changed: 95 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,98 @@ eleventyComputed:
55
---
66
Sitemap: {{ site.url }}/sitemap.xml
77

8-
User-agent: *
9-
Disallow:
8+
# Block all known AI crawlers and assistants
9+
# from crawling the site or using its content for training AI models.
10+
# Source: https://robotstxt.com/ai
11+
User-Agent: GPTBot
12+
User-Agent: ClaudeBot
13+
User-Agent: Claude-User
14+
User-Agent: Claude-SearchBot
15+
User-Agent: CCBot
16+
User-Agent: Google-Extended
17+
User-Agent: Applebot-Extended
18+
User-Agent: Facebookbot
19+
User-Agent: Meta-ExternalAgent
20+
User-Agent: Meta-ExternalFetcher
21+
User-Agent: diffbot
22+
User-Agent: PerplexityBot
23+
User-Agent: Perplexity-User
24+
User-Agent: Omgili
25+
User-Agent: Omgilibot
26+
User-Agent: webzio-extended
27+
User-Agent: ImagesiftBot
28+
User-Agent: Bytespider
29+
User-Agent: TikTokSpider
30+
User-Agent: Amazonbot
31+
User-Agent: Youbot
32+
User-Agent: SemrushBot-OCOB
33+
User-Agent: Petalbot
34+
User-Agent: VelenPublicWebCrawler
35+
User-Agent: TurnitinBot
36+
User-Agent: Timpibot
37+
User-Agent: OAI-SearchBot
38+
User-Agent: ICC-Crawler
39+
User-Agent: AI2Bot
40+
User-Agent: AI2Bot-Dolma
41+
User-Agent: DataForSeoBot
42+
User-Agent: AwarioBot
43+
User-Agent: AwarioSmartBot
44+
User-Agent: AwarioRssBot
45+
User-Agent: Google-CloudVertexBot
46+
User-Agent: PanguBot
47+
User-Agent: Kangaroo Bot
48+
User-Agent: Sentibot
49+
User-Agent: img2dataset
50+
User-Agent: Meltwater
51+
User-Agent: Seekr
52+
User-Agent: peer39_crawler
53+
User-Agent: cohere-ai
54+
User-Agent: cohere-training-data-crawler
55+
User-Agent: DuckAssistBot
56+
User-Agent: Scrapy
57+
User-Agent: Cotoyogi
58+
User-Agent: aiHitBot
59+
User-Agent: Factset_spyderbot
60+
User-Agent: FirecrawlAgent
61+
User-Agent: bedrockbot
62+
User-Agent: DeepSeekBot
63+
User-Agent: GoogleAgent-Mariner
64+
User-Agent: Gemini-Deep-Research
65+
User-Agent: Google-NotebookLM
66+
User-Agent: Google-Agent
67+
User-Agent: GoogleAgent-URLContext
68+
User-Agent: Google-Firebase
69+
User-Agent: MistralAI-User
70+
User-Agent: SemrushBot-FT
71+
User-Agent: SemrushBot-ESI
72+
User-Agent: AddSearchBot
73+
User-Agent: bigsur.ai
74+
User-Agent: Brightbot
75+
User-Agent: Crawlspace
76+
User-Agent: EchoboxBot
77+
User-Agent: FriendlyCrawler
78+
User-Agent: LinerBot
79+
User-Agent: Panscient
80+
User-Agent: Panscient.com
81+
User-Agent: Poseidon Research Crawler
82+
User-Agent: SBIntuitionsBot
83+
User-Agent: TerraCotta
84+
User-Agent: Thinkbot
85+
User-Agent: Yak
86+
User-Agent: YandexAdditional
87+
User-Agent: YandexAdditionalBot
88+
# The rules below apply to every user agent listed above.
89+
Disallow: /
90+
DisallowAITraining: /
91+
92+
# Block any non-specified AI crawlers (e.g., new
93+
# or unknown bots) from using content for training
94+
# AI models, while allowing the website to be
95+
# indexed and accessed by bots. These directives
96+
# are still experimental and may not be supported
97+
# by all AI crawlers.
98+
User-Agent: *
99+
DisallowAITraining: /
100+
Content-Usage: ai=n
101+
Content-Signal: search=yes, ai-input=no, ai-train=no
102+
Allow: /

docs/robots.txt

Lines changed: 95 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,97 @@
11
Sitemap: https://www.ffoodd.fr/sitemap.xml
22

3-
User-agent: *
4-
Disallow:
3+
# Block all known AI crawlers and assistants
4+
# from crawling the site or using its content for training AI models.
5+
# Source: https://robotstxt.com/ai
6+
User-Agent: GPTBot
7+
User-Agent: ClaudeBot
8+
User-Agent: Claude-User
9+
User-Agent: Claude-SearchBot
10+
User-Agent: CCBot
11+
User-Agent: Google-Extended
12+
User-Agent: Applebot-Extended
13+
User-Agent: Facebookbot
14+
User-Agent: Meta-ExternalAgent
15+
User-Agent: Meta-ExternalFetcher
16+
User-Agent: diffbot
17+
User-Agent: PerplexityBot
18+
User-Agent: Perplexity-User
19+
User-Agent: Omgili
20+
User-Agent: Omgilibot
21+
User-Agent: webzio-extended
22+
User-Agent: ImagesiftBot
23+
User-Agent: Bytespider
24+
User-Agent: TikTokSpider
25+
User-Agent: Amazonbot
26+
User-Agent: Youbot
27+
User-Agent: SemrushBot-OCOB
28+
User-Agent: Petalbot
29+
User-Agent: VelenPublicWebCrawler
30+
User-Agent: TurnitinBot
31+
User-Agent: Timpibot
32+
User-Agent: OAI-SearchBot
33+
User-Agent: ICC-Crawler
34+
User-Agent: AI2Bot
35+
User-Agent: AI2Bot-Dolma
36+
User-Agent: DataForSeoBot
37+
User-Agent: AwarioBot
38+
User-Agent: AwarioSmartBot
39+
User-Agent: AwarioRssBot
40+
User-Agent: Google-CloudVertexBot
41+
User-Agent: PanguBot
42+
User-Agent: Kangaroo Bot
43+
User-Agent: Sentibot
44+
User-Agent: img2dataset
45+
User-Agent: Meltwater
46+
User-Agent: Seekr
47+
User-Agent: peer39_crawler
48+
User-Agent: cohere-ai
49+
User-Agent: cohere-training-data-crawler
50+
User-Agent: DuckAssistBot
51+
User-Agent: Scrapy
52+
User-Agent: Cotoyogi
53+
User-Agent: aiHitBot
54+
User-Agent: Factset_spyderbot
55+
User-Agent: FirecrawlAgent
56+
User-Agent: bedrockbot
57+
User-Agent: DeepSeekBot
58+
User-Agent: GoogleAgent-Mariner
59+
User-Agent: Gemini-Deep-Research
60+
User-Agent: Google-NotebookLM
61+
User-Agent: Google-Agent
62+
User-Agent: GoogleAgent-URLContext
63+
User-Agent: Google-Firebase
64+
User-Agent: MistralAI-User
65+
User-Agent: SemrushBot-FT
66+
User-Agent: SemrushBot-ESI
67+
User-Agent: AddSearchBot
68+
User-Agent: bigsur.ai
69+
User-Agent: Brightbot
70+
User-Agent: Crawlspace
71+
User-Agent: EchoboxBot
72+
User-Agent: FriendlyCrawler
73+
User-Agent: LinerBot
74+
User-Agent: Panscient
75+
User-Agent: Panscient.com
76+
User-Agent: Poseidon Research Crawler
77+
User-Agent: SBIntuitionsBot
78+
User-Agent: TerraCotta
79+
User-Agent: Thinkbot
80+
User-Agent: Yak
81+
User-Agent: YandexAdditional
82+
User-Agent: YandexAdditionalBot
83+
# The rules below apply to every user agent listed above.
84+
Disallow: /
85+
DisallowAITraining: /
86+
87+
# Block any non-specified AI crawlers (e.g., new
88+
# or unknown bots) from using content for training
89+
# AI models, while allowing the website to be
90+
# indexed and accessed by bots. These directives
91+
# are still experimental and may not be supported
92+
# by all AI crawlers.
93+
User-Agent: *
94+
DisallowAITraining: /
95+
Content-Usage: ai=n
96+
Content-Signal: search=yes, ai-input=no, ai-train=no
97+
Allow: /

0 commit comments

Comments
 (0)