Outils personnels

Exemple robots.txt

De wikiGite

Révision datée du 29 juillet 2014 à 10:08 par Frank (discussion | contributions) (Page créée avec « Exemple de fichier robots.txt http://www.arnold-soft.de/robots.txt # BotDoku: de.wetena.com/bot # Doku: www.robotstxt.org # erlaubte robots # google.com Googleb... »)
(diff) ← Version précédente | Voir la version actuelle (diff) | Version suivante → (diff)

Exemple de fichier robots.txt

http://www.arnold-soft.de/robots.txt

# BotDoku: de.wetena.com/bot
# Doku:    www.robotstxt.org

# erlaubte robots
# google.com Googlebot
# bing.com bingbot
# msn.com
# MSIE
# info@netcraft.com

# Hier eine Liste der unerwuenschten robots
# Yandex
# www.infohelfer.de
# warebay.com
# thunderstone.com
# pixray.com
# aihit.com
# ips-agent
# MALC

# metadatalabs.com
User-agent: MLBot
Disallow: /

#  Ahrefs.com (http://ahrefs.com/robot/)
#  IP 5.10.83.36
#  "Mozilla/5.0 (compatible; AhrefsBot/5.0; +http://ahrefs.com/robot/)"
user-agent: AhrefsBot
disallow: /

# ezooms.bot
User-agent: Ezooms
Disallow: /

# domaintools.com
User-agent: SurveyBot
Disallow: /

# www.infohelfer.de
User-agent: Infohelfer
Disallow: /

# www.pixray.com
# No trailing "*": a wildcard is only meaningful as the entire User-agent
# value. "Pixray*" is compared literally by strict parsers and never matches.
User-agent: Pixray
Disallow: /

# warebay.com
User-agent: WBSearchBot
Disallow: /

# aihit.com
User-agent: aiHitBot
Disallow: /

# yandex.com  YandexBot YandexImages
# IP 141.8.147.17
# "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)"
User-agent: YandexBot
Disallow: /
User-agent: YandexImages
Disallow: /

# U 
User-agent: U
Disallow: /

# unister.de
User-agent: UnisterBot
Disallow: /

# www.Nutch.de
# IP 62.146.2.234, 117.78.13.18
# "Domnutch-Bot/Nutch-1.0 (Domnutch; http://www.Nutch.de/)"
# Version-free tokens: the logged UA reports Nutch-1.0, so a rule pinned to
# "nutch-1.4" does not cover it. Both product names from the UA are listed.
User-agent: Nutch
User-agent: Domnutch-Bot
Disallow: /

User-agent: discobot
Disallow: /

# SEO Spider  spider@spiderlytics.com 
# IP 5.199.136.130
# "Mozilla/5.0 (compatible; Spiderlytics/1.0; +spider@spiderlytics.com)"
User-agent: Spiderlytics
Disallow: /

# Unknown
# IP 207.241.226.239
# "ia_archiver(OS-Wayback)"
User-agent: ia_archiver
Disallow: /

# crawler@alexa.com
# IP 204.236.235.245
# "ia_archiver (+http://www.alexa.com/site/help/webmasters; crawler@alexa.com)"
# NOTE(review): the logged UA begins with "ia_archiver", which has its own
# record in this file. The token "alexa" presumably only takes effect with
# parsers that substring-match against the full UA string - confirm.
User-agent: alexa
Disallow: /

# Unknown
# IP 108.59.8.70
# "Mozilla/5.0 (compatible; MJ12bot/v1.4.4; http://www.majestic12.co.uk/bot.php?+)"
User-agent: MJ12bot
Disallow: /

# http://go.mail.ru/help/robots
# IP 217.69.133.253
# "Mozilla/5.0 (compatible; Linux x86_64; Mail.RU_Bot/2.0; +http://go.mail.ru/help/robots)"
User-agent: Mail.RU_Bot
Disallow: /

# macinroy.com
# IP 85.25.137.24
# "MacInroy Privacy Auditors. See jarnold.org's privacy violation report: http://jarnold.org.macinroy.com/jarnold.org"
User-agent: MacInroy
Disallow: /

# www.semrush.com/bot.html
# IP 46.229.164.102
# "Mozilla/5.0 (compatible; SemrushBot/0.97; +http://www.semrush.com/bot.html)"
User-agent: SemrushBot
Disallow: /

# http://www.icjobs.de
# IP 85.25.71.40
# "Mozilla/5.0 (X11; U; Linux i686; de; rv:1.9.0.1; compatible; iCjobs Stellenangebote Jobs; http://www.icjobs.de) Gecko/20100401 iCjobs/3.2.3"
User-agent: iCjobs
Disallow: /

# http://fulltext.sblog.cz
# IP 77.75.77.32
# "SeznamBot/3.0 (+http://fulltext.sblog.cz/)"
User-agent: SeznamBot
Disallow: /

# http://webmeup-crawler.com
# IP 108.178.53.146
# "Mozilla/5.0 (compatible; BLEXBot/1.0; +http://webmeup-crawler.com/)"
User-agent: BLEXBot
Disallow: /

# http://siteexplorer.info
# IP 208.43.225.84
# "Mozilla/5.0 (compatible; SiteExplorer/1.0b; +http://siteexplorer.info/)"
User-agent: SiteExplorer
Disallow: /

# www.linkdex.com/about/bots
# IP 54.242.123.170, 23.22.229.75, 54.225.52.217 23.20.126.233
# "Mozilla/5.0 (compatible; linkdexbot/2.0; +http://www.linkdex.com/about/bots/)"
User-agent: linkdexbot
Disallow: /

# www.wotbox.com/bot
# IP 81.144.138.34
# "Wotbox/2.01 (+http://www.wotbox.com/bot/)"
User-agent: Wotbox
Disallow: /

# http://www.domaintuno.com
# IP 192.96.204.42
# "http://www.domaintuno.com/whois/jarnold.org" "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"
User-agent: domaintuno
Disallow: /

# unknown addressendeutschland.de
# IP 86.109.249.174
# "http://arnold-soft.de/" "dubaiindex (addressendeutschland.de)"
User-agent: dubaiindex
Disallow: /

# www.pagesinvenotry.com
# IP 130.185.109.243
# "PagesInventory (robot http://www.pagesinvenotry.com)"
User-agent: PagesInventory
Disallow: /

# www.abonti.com
# IP 77.233.225.115
# "Mozilla/5.0 (compatible; Abonti/0.91 - http://www.abonti.com)"
User-agent: Abonti
Disallow: /

# www.backlinktest.com/crawler.html
# IP 46.4.100.231
# "BacklinkCrawler (http://www.backlinktest.com/crawler.html)"
User-agent: BacklinkCrawler
Disallow: /

# http://netcomber.com
# IP 54.227.175.17
# "NCBot http://netcomber.com?st=ba2Tool for finding all their domain names."
User-agent: NCBot
Disallow: /

# Unknown
# IP 69.58.178.58
# "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:14.0; ips-agent) Gecko/20100101 Firefox/14.0.1"
User-agent: ips-agent
Disallow: /

# www.grapeshot.co.uk/crawler.php
# IP 89.145.95.2
# "Mozilla/5.0 (compatible; GrapeshotCrawler/2.0; +http://www.grapeshot.co.uk/crawler.php)"
User-agent: GrapeshotCrawler
Disallow: /

# www.80legs.com/webcrawler.html
# IP 64.125.222.16
# "Mozilla/5.0 (compatible; 008/0.83; http://www.80legs.com/webcrawler.html;) Gecko/2008032620"
# Use the version-free token so every release of the crawler is covered,
# not only 0.83 and 0.85.
User-agent: 008
Disallow: /

# it2media.de
# IP 86.109.249.169
# "it2media-domain-crawler/1.0 on crawler-prod.it2media.de"
User-agent: it2media-domain-crawler
Disallow: /

# http://crawler.sistrix.net
# IP 176.9.148.197, IP 176.9.155.226, 5.9.112.66
# "Mozilla/5.0 (compatible; SISTRIX Crawler; http://crawler.sistrix.net/)"
User-agent: SISTRIX
Disallow: /

# www.picsearch.com/bot.html
# IP 217.212.224.183
# "psbot/0.1 (+http://www.picsearch.com/bot.html)"
User-agent: psbot
Disallow: /

# worio.com
# IP 107.22.250.59
# "Mozilla/5.0 (compatible; woriobot +http://worio.com)"
User-agent: woriobot
Disallow: /

# semantissimo.de
# IP 88.198.24.173
# "ssearch_bot (sSearch Crawler; http://www.semantissimo.de)"
User-agent: sSearch
Disallow: /

# www.archive.org/details/archive.org_bot
# IP 207.241.237.102  + .103 (abwechselnd!) + 207.241.226.234
# "Mozilla/5.0 (compatible; archive.org_bot; Wayback Machine Live Record; +http://archive.org/details/archive.org_bot)"
User-agent: archive.org_bot
Disallow: /

# +spider@waybackarchive.org
# IP 5.199.136.130
# "Mozilla/5.0 (compatible; waybackarchive.org/1.0; +spider@waybackarchive.org)"
User-agent: waybackarchive.org
Disallow: /

# www.website-datenbank.de
# IP 81.209.177.145
# "netEstate NE Crawler (+http://www.website-datenbank.de/)"
User-agent: netEstate
Disallow: /

# www.compspy.com/spider.html
# IP 68.47.129.55
# "Mozilla/5.0 (compatible; CompSpyBot/1.0; +http://www.compspy.com/spider.html)"
User-agent: CompSpyBot
Disallow: /

# www.seoprofiler.com/bot
# IP 198.199.89.149, 162.243.203.202
# "Mozilla/5.0 (compatible; spbot/4.1.0; +http://OpenLinkProfiler.org/bot )"
User-agent: spbot
Disallow: /

# http://filterdb.iss.net/crawler/
# IP 206.253.226.18
# "Mozilla/5.0 (compatible; oBot/2.3.1; http://filterdb.iss.net/crawler/)"
User-agent: oBot
Disallow: /

# http://www.baidu.com
# 183.60.243.187
# "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0"
# Baidu's crawler identifies itself with the product token "Baiduspider".
# A bare "baidu" is not matched by parsers that compare whole product tokens.
# NOTE: the UA logged above is a plain browser string; a client that does not
# identify itself as a robot cannot be addressed through robots.txt.
User-agent: Baiduspider
Disallow: /

# http://www.exabot.com/go/robot
# IP 178.255.215.69
# "Mozilla/5.0 (compatible; Exabot/3.0; +http://www.exabot.com/go/robot)"
User-agent: Exabot
Disallow: /

# www.tiscali.it
# IP 217.73.208.103
# "Mozilla/5.0 (compatible; IstellaBot/1.18.81 +http://www.tiscali.it/)"
User-agent: IstellaBot
Disallow: /

# www.netseer.com/crawler.html
# IP 75.98.9.250
# "Mozilla/5.0 (compatible; NetSeer crawler/2.0; +http://www.netseer.com/crawler.html; crawler@netseer.com)"
User-agent: NetSeer
Disallow: /

# http://www.opensiteexplorer.org/dotbot, help@moz.com
# IP 208.115.113.92
# "Mozilla/5.0 (compatible; DotBot/1.1; http://www.opensiteexplorer.org/dotbot, help@moz.com)"
User-agent: DotBot
Disallow: /

# http://www.proximic.com/info/spider.php
# IP 54.211.1.18
# "Mozilla/5.0 (compatible; proximic; +http://www.proximic.com/info/spider.php)"
User-agent: proximic
Disallow: /

# http://commoncrawl.org/faq/
# IP 54.227.12.4
# "CCBot/2.0 (http://commoncrawl.org/faq/)"
User-agent: CCBot
Disallow: /

# 
# IP 130.211.186.147, 146.148.35.52
# "GET / HTTP/1.0" 200 10064 "-" "NerdyBot"
User-agent: NerdyBot
Disallow: /

# http://semalt.semalt.com/crawler.php
# IP 187.79.214.121
# "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36"
#User-agent: xxx
#Disallow: /

#
# IP 69.84.207.246
# "LSSRocketCrawler/1.0 LightspeedSystems"
User-agent: LSSRocketCrawler
Disallow: /

# ???
# 50.17.21.141
# "Cliqzbot"
User-agent: Cliqzbot
Disallow: /

# Google AdSense crawler (documented token: Mediapartners-Google).
# No trailing "*": wildcards are not valid inside a User-agent value.
User-agent: Mediapartners-Google
Disallow: /

# Default settings: this group applies to every robot that is not addressed
# by one of the named User-agent groups elsewhere in this file. Such robots
# may crawl the site except for the directories listed below.
User-agent: *
Disallow: /atd/
Disallow: /backup/
Disallow: /files/
Disallow: /log/
Disallow: /phptmp/
Disallow: /restore/
Disallow: /html/_media/
Disallow: /html/media/images/
Disallow: /html/media/Scripting/
Disallow: /html/cgi-bin/
Disallow: /html/mediawiki/

# Disabled Allow rules (commented out, therefore not in effect):
# Allow: /html/
# Allow: /html/media/files/