Exemple robots.txt
De wikiGite
Révision datée du 29 juillet 2014 à 10:08 par Frank (discussion | contributions) (Page créée avec « Exemple de fichier robots.txt http://www.arnold-soft.de/robots.txt # BotDoku: de.wetena.com/bot # Doku: www.robotstxt.org # erlaubte robots # google.com Googleb... »)
Exemple de fichier robots.txt
http://www.arnold-soft.de/robots.txt
# robots.txt example (source: http://www.arnold-soft.de/robots.txt)
#
# Layout: one directive or comment per line. A "#" comment runs to the end of
# the physical line, so directives must never share a line with a preceding
# comment. Each group below is preceded by the crawler's info URL, the IP(s)
# it was seen from, and the User-Agent string found in the access log.
#
# Bot docs: de.wetena.com/bot
# Docs:     www.robotstxt.org

# Allowed robots
# google.com Googlebot
# bing.com bingbot
# msn.com
# MSIE
# info@netcraft.com

# List of unwanted robots
# Yandex
# www.infohelfer.de
# warebay.com
# thunderstone.com
# pixray.com
# aihit.com
# ips-agent
# MALC

# metadatalabs.com
User-agent: MLBot
Disallow: /

# Ahrefs.com (http://ahrefs.com/robot/)
# IP 5.10.83.36
# "Mozilla/5.0 (compatible; AhrefsBot/5.0; +http://ahrefs.com/robot/)"
User-agent: AhrefsBot
Disallow: /

# ezooms.bot
User-agent: Ezooms
Disallow: /

# domaintools.com
User-agent: SurveyBot
Disallow: /

# www.infohelfer.de
User-agent: Infohelfer
Disallow: /

# www.pixray.com
# Plain product token: "*" is only valid on its own, not as a suffix.
User-agent: Pixray
Disallow: /

# warebay.com
User-agent: WBSearchBot
Disallow: /

# aihit.com
User-agent: aiHitBot
Disallow: /

# yandex.com YandexBot YandexImages
# IP 141.8.147.17
# "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)"
User-agent: YandexBot
Disallow: /

User-agent: YandexImages
Disallow: /

# U
User-agent: U
Disallow: /

# unister.de
User-agent: UnisterBot
Disallow: /

# www.Nutch.de
# IP 62.146.2.234, 117.78.13.18
# "Domnutch-Bot/Nutch-1.0 (Domnutch; http://www.Nutch.de/)"
# NOTE(review): the logged UA reports Nutch-1.0 while the token below says
# nutch-1.4 -- confirm which token this crawler actually honors.
User-agent: nutch-1.4
Disallow: /

User-agent: discobot
Disallow: /

# SEO Spider spider@spiderlytics.com
# IP 5.199.136.130
# "Mozilla/5.0 (compatible; Spiderlytics/1.0; +spider@spiderlytics.com)"
User-agent: Spiderlytics
Disallow: /

# Unknown
# IP 207.241.226.239
# "ia_archiver(OS-Wayback)"
User-agent: ia_archiver
Disallow: /

# crawler@alexa.com
# IP 204.236.235.245
# "ia_archiver (+http://www.alexa.com/site/help/webmasters; crawler@alexa.com)"
User-agent: alexa
Disallow: /

# Unknown
# IP 108.59.8.70
# "Mozilla/5.0 (compatible; MJ12bot/v1.4.4; http://www.majestic12.co.uk/bot.php?+)"
User-agent: MJ12bot
Disallow: /

# http://go.mail.ru/help/robots
# IP 217.69.133.253
# "Mozilla/5.0 (compatible; Linux x86_64; Mail.RU_Bot/2.0; +http://go.mail.ru/help/robots)"
User-agent: Mail.RU_Bot
Disallow: /

# macinroy.com
# IP 85.25.137.24
# "MacInroy Privacy Auditors. See jarnold.org's privacy violation report: http://jarnold.org.macinroy.com/jarnold.org"
User-agent: MacInroy
Disallow: /

# www.semrush.com/bot.html
# IP 46.229.164.102
# "Mozilla/5.0 (compatible; SemrushBot/0.97; +http://www.semrush.com/bot.html)"
User-agent: SemrushBot
Disallow: /

# http://www.icjobs.de
# IP 85.25.71.40
# "Mozilla/5.0 (X11; U; Linux i686; de; rv:1.9.0.1; compatible; iCjobs Stellenangebote Jobs; http://www.icjobs.de) Gecko/20100401 iCjobs/3.2.3"
User-agent: iCjobs
Disallow: /

# http://fulltext.sblog.cz
# IP 77.75.77.32
# "SeznamBot/3.0 (+http://fulltext.sblog.cz/)"
User-agent: SeznamBot
Disallow: /

# http://webmeup-crawler.com
# IP 108.178.53.146
# "Mozilla/5.0 (compatible; BLEXBot/1.0; +http://webmeup-crawler.com/)"
User-agent: BLEXBot
Disallow: /

# http://siteexplorer.info
# IP 208.43.225.84
# "Mozilla/5.0 (compatible; SiteExplorer/1.0b; +http://siteexplorer.info/)"
User-agent: SiteExplorer
Disallow: /

# www.linkdex.com/about/bots
# IP 54.242.123.170, 23.22.229.75, 54.225.52.217 23.20.126.233
# "Mozilla/5.0 (compatible; linkdexbot/2.0; +http://www.linkdex.com/about/bots/)"
User-agent: linkdexbot
Disallow: /

# www.wotbox.com/bot
# IP 81.144.138.34
# "Wotbox/2.01 (+http://www.wotbox.com/bot/)"
User-agent: Wotbox
Disallow: /

# http://www.domaintuno.com
# IP 192.96.204.42
# "http://www.domaintuno.com/whois/jarnold.org" "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"
User-agent: domaintuno
Disallow: /

# unknown addressendeutschland.de
# IP 86.109.249.174
# "http://arnold-soft.de/" "dubaiindex (addressendeutschland.de)"
User-agent: dubaiindex
Disallow: /

# www.pagesinvenotry.com
# IP 130.185.109.243
# "PagesInventory (robot http://www.pagesinvenotry.com)"
User-agent: PagesInventory
Disallow: /

# www.abonti.com
# IP 77.233.225.115
# "Mozilla/5.0 (compatible; Abonti/0.91 - http://www.abonti.com)"
User-agent: Abonti
Disallow: /

# www.backlinktest.com/crawler.html
# IP 46.4.100.231
# "BacklinkCrawler (http://www.backlinktest.com/crawler.html)"
User-agent: BacklinkCrawler
Disallow: /

# http://netcomber.com
# IP 54.227.175.17
# "NCBot http://netcomber.com?st=ba2Tool for finding all their domain names."
User-agent: NCBot
Disallow: /

# Unknown
# IP 69.58.178.58
# "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:14.0; ips-agent) Gecko/20100101 Firefox/14.0.1"
User-agent: ips-agent
Disallow: /

# www.grapeshot.co.uk/crawler.php
# IP 89.145.95.2
# "Mozilla/5.0 (compatible; GrapeshotCrawler/2.0; +http://www.grapeshot.co.uk/crawler.php)"
User-agent: GrapeshotCrawler
Disallow: /

# www.80legs.com/webcrawler.html
# IP 64.125.222.16
# "Mozilla/5.0 (compatible; 008/0.83; http://www.80legs.com/webcrawler.html;) Gecko/2008032620"
# The version-less token is listed first so the rule keeps applying when the
# crawler's version changes; the versioned tokens are kept for compatibility.
User-agent: 008
User-agent: 008/0.83
User-agent: 008/0.85
Disallow: /

# it2media.de
# IP 86.109.249.169
# "it2media-domain-crawler/1.0 on crawler-prod.it2media.de"
User-agent: it2media-domain-crawler
Disallow: /

# http://crawler.sistrix.net
# IP 176.9.148.197, IP 176.9.155.226, 5.9.112.66
# "Mozilla/5.0 (compatible; SISTRIX Crawler; http://crawler.sistrix.net/)"
User-agent: SISTRIX
Disallow: /

# www.picsearch.com/bot.html
# IP 217.212.224.183
# "psbot/0.1 (+http://www.picsearch.com/bot.html)"
User-agent: psbot
Disallow: /

# worio.com
# IP 107.22.250.59
# "Mozilla/5.0 (compatible; woriobot +http://worio.com)"
User-agent: woriobot
Disallow: /

# semantissimo.de
# IP 88.198.24.173
# "ssearch_bot (sSearch Crawler; http://www.semantissimo.de)"
User-agent: sSearch
Disallow: /

# www.archive.org/details/archive.org_bot
# IP 207.241.237.102 + .103 (alternating!) + 207.241.226.234
# "Mozilla/5.0 (compatible; archive.org_bot; Wayback Machine Live Record; +http://archive.org/details/archive.org_bot)"
User-agent: archive.org_bot
Disallow: /

# +spider@waybackarchive.org
# IP 5.199.136.130
# "Mozilla/5.0 (compatible; waybackarchive.org/1.0; +spider@waybackarchive.org)"
User-agent: waybackarchive.org
Disallow: /

# www.website-datenbank.de
# IP 81.209.177.145
# "netEstate NE Crawler (+http://www.website-datenbank.de/)"
User-agent: netEstate
Disallow: /

# www.compspy.com/spider.html
# IP 68.47.129.55
# "Mozilla/5.0 (compatible; CompSpyBot/1.0; +http://www.compspy.com/spider.html)"
User-agent: CompSpyBot
Disallow: /

# www.seoprofiler.com/bot
# IP 198.199.89.149, 162.243.203.202
# "Mozilla/5.0 (compatible; spbot/4.1.0; +http://OpenLinkProfiler.org/bot )"
User-agent: spbot
Disallow: /

# http://filterdb.iss.net/crawler/
# IP 206.253.226.18
# "Mozilla/5.0 (compatible; oBot/2.3.1; http://filterdb.iss.net/crawler/)"
User-agent: oBot
Disallow: /

# http://www.baidu.com
# 183.60.243.187
# "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0"
User-agent: baidu
Disallow: /

# http://www.exabot.com/go/robot
# IP 178.255.215.69
# "Mozilla/5.0 (compatible; Exabot/3.0; +http://www.exabot.com/go/robot)"
User-agent: Exabot
Disallow: /

# www.tiscali.it
# IP 217.73.208.103
# "Mozilla/5.0 (compatible; IstellaBot/1.18.81 +http://www.tiscali.it/)"
User-agent: IstellaBot
Disallow: /

# www.netseer.com/crawler.html
# IP 75.98.9.250
# "Mozilla/5.0 (compatible; NetSeer crawler/2.0; +http://www.netseer.com/crawler.html; crawler@netseer.com)"
User-agent: NetSeer
Disallow: /

# http://www.opensiteexplorer.org/dotbot, help@moz.com
# IP 208.115.113.92
# "Mozilla/5.0 (compatible; DotBot/1.1; http://www.opensiteexplorer.org/dotbot, help@moz.com)"
User-agent: DotBot
Disallow: /

# http://www.proximic.com/info/spider.php
# IP 54.211.1.18
# "Mozilla/5.0 (compatible; proximic; +http://www.proximic.com/info/spider.php)"
User-agent: proximic
Disallow: /

# http://commoncrawl.org/faq/
# IP 54.227.12.4
# "CCBot/2.0 (http://commoncrawl.org/faq/)"
User-agent: CCBot
Disallow: /

#
# IP 130.211.186.147, 146.148.35.52
# "GET / HTTP/1.0" 200 10064 "-" "NerdyBot"
User-agent: NerdyBot
Disallow: /

# http://semalt.semalt.com/crawler.php
# IP 187.79.214.121
# "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36"
# Rule left disabled: the logged UA is a generic browser string with no
# product token to match against.
#User-agent: xxx
#Disallow: /

#
# IP 69.84.207.246
# "LSSRocketCrawler/1.0 LightspeedSystems"
User-agent: LSSRocketCrawler
Disallow: /

# ???
# 50.17.21.141
# "Cliqzbot"
User-agent: Cliqzbot
Disallow: /

# Plain product token: "*" is only valid on its own, not as a suffix.
User-agent: Mediapartners-Google
Disallow: /

# Default settings for every robot not listed above
User-agent: *
Disallow: /atd/
Disallow: /backup/
Disallow: /files/
Disallow: /log/
Disallow: /phptmp/
Disallow: /restore/
Disallow: /html/_media/
Disallow: /html/media/images/
Disallow: /html/media/Scripting/
Disallow: /html/cgi-bin/
Disallow: /html/mediawiki/
# Allow: /html/
# Allow: /html/media/files/