Changesets can be listed by changeset number.
The Git repository is here.
- Revision:
- 297
- Log:
Updated to AWStats 7.0.
- Author:
- rool
- Date:
- Fri Mar 18 13:33:29 +0000 2011
- Size:
- 81124 Bytes
- Properties:
- Property svn:executable is set
1 | # AWSTATS ROBOTS DATABASE |
2 | #------------------------------------------------------- |
3 | # If you want to add robots to extend AWStats database detection capabilities, |
4 | # you must add an entry in RobotsSearchIDOrder_listx and RobotsHashIDLib. |
5 | #------------------------------------------------------- |
6 | # $Revision: 1.62 $ - $Author: manolamancha $ - $Date: 2010/04/30 14:02:44 $ |
7 | |
8 | # 2005-08-19 Sean Carlos http://www.antezeta.com/awstats.html |
9 | # added dipsie (not tested with real data). |
10 | # added DomainsDB.net http://domainsdb.net/ |
11 | # added ia_archiver-web.archive.org (was inadvertently grouped with Alexa traffic) |
12 | # added Nutch (used by looksmart (furl?)) |
13 | # added rssImagesBot |
14 | # added Sqworm |
15 | # added t\-h\-u\-n\-d\-e\-r\-s\-t\-o\-n\-e |
16 | # added w3c css-validator |
17 | # added documentation link to bot home pages for above and selected major bots. |
18 | # In the case of international bots, choose .com page. |
19 | # Included tool tip (html "title"). |
20 | # To do: parameterize to match both AWStats language and tooltips settings. |
21 | # To do: add html links for all bots based on current documentation in source |
22 | # files referenced below. |
23 | # changed '\wbot[\/\-]', to '\wbot[\/\-]' (removed comma) |
24 | # made minor grammar corrections to notes below |
25 | # 2005-08-24 added YahooSeeker-Testing |
26 | # added w3c-checklink |
27 | # updated url for ask.com |
28 | # 2005-08-24 added Girafabot http://www.girafa.com/ |
29 | # 2005-08-30 added PluckFeedCrawler http://www.pluck.com/ |
30 | # added Gaisbot/3.0 (robot05@gais.cs.ccu.edu.tw; ) |
31 | # added geniebot (wgao@genieknows.com) |
32 | # added BecomeBot link http://www.become.com/site_owners.html |
33 | # added topicblogs http://www.topicblogs.com/ |
34 | # added Powermarks; seen used by referrer spam |
35 | # added YahooSeeker |
36 | # added NG/2. http://www.exabot.com/ |
37 | # 2005-09-15 added link for Walhello appie |
38 | # added bender focused_crawler |
39 | # updated YahooSeeker description (blog crawler) |
40 | # 2005-09-16 added link for http://linkchecker.sourceforge.net |
41 | # added ConveraCrawler/0.9d ( http://www.authoritativeweb.com/crawl) |
42 | # added Blogslive info@blogslive.com intelliseek.com |
43 | # added BlogPulse (ISSpider-3.0) intelliseek.com |
44 | # 2005-09-26 added Feedfetcher-Google (http://www.google.com/feedfetcher.html) |
45 | # added EverbeeCrawler |
46 | # added Yahoo-Blogs http://help.yahoo.com/help/us/ysearch/crawling/crawling-02.html |
47 | # added link for Bloglines http://www.bloglines.com |
48 | # 2005-10-19 fixed Feedfetcher-Google (http://www.google.com/feedfetcher.html) |
49 | # added Blogshares Spiders (Synchronized V1.5.1) |
50 | # added yacy |
51 | # 2005-11-21 added Argus www.simpy.com |
52 | # added BlogsSay :: RSS Search Crawler (http://www.blogssay.com/) |
53 | # added MJ12bot http://majestic12.co.uk/bot.php |
54 | # added OpenTaggerBot (http://www.opentagger.com/opentaggerbot.htm) |
55 | # added OutfoxBot/0.3 (For internet experiments; outfox.agent@gmail.com) |
56 | # added RufusBot Rufus Web Miner http://64.124.122.252.webaroo.com/feedback.html |
57 | # added Seekbot (http://www.seekbot.net/bot.html) |
58 | # added Yahoo-MMCrawler/3.x (mms-mmcrawler-support@yahoo-inc.com) |
59 | # added link for BaiDuSpider |
60 | # added link for Blogshares Spider |
61 | # added link for StackRambler http://www.rambler.ru/doc/faq.shtml |
62 | # added link for WISENutbot |
63 | # added link for ZyBorg/1.0 (wn-14.zyborg@looksmart.net; http://www.WISEnutbot.com). Moved location to above wisenut to avoid classification as wisenut |
64 | # 2005-12-15 |
65 | # added FAST Enteprise Crawler/6 (www dot fastsearch dot com). Note spelling Enteprise not Enterprise. |
66 | # added findlinks http://wortschatz.uni-leipzig.de/findlinks/ |
67 | # added IBM Almaden Research Center WebFountain™ http://www.almaden.ibm.com/cs/crawler [hc3] |
68 | # added INFOMINE/8.0 VLCrawler (http://infomine.ucr.edu/useragents) |
69 | # added lmspider (lmspider@scansoft.com) http://www.nuance.com/ |
70 | # added noxtrumbot http://www.noxtrum.com/ |
71 | # added SandCrawler (Microsoft) |
72 | # added SBIder http://www.sitesell.com/sbider.html |
73 | # added SeznamBot http://fulltext.seznam.cz/ |
74 | # added sohu-search http://corp.sohu.com/ (looked for //robots.txt not /robots.txt) |
75 | # added the ruffle SemanticWeb crawler v0.5 - http://www.unreach.net |
76 | # added WebVulnCrawl/1.0 libwww-perl/5.803 (looked for //robots.txt not /robots.txt) |
77 | # added Yahoo! Japan keyoshid http://www.yahoo.co.jp/ |
78 | # added Y!J http://help.yahoo.co.jp/help/jp/search/indexing/indexing-15.html |
79 | # added link for GigaBot |
80 | # added link for MagpieRSS |
81 | # added link for MSIECrawler |
82 | # 2005-12-21 |
83 | # added aipbot http://www.aipbot.com aipbot@aipbot.com [matthys70 users.sourceforge.net] |
84 | # added Everest-Vulcan Inc./0.1 (R&D project; http://everest.vulcan.com/crawlerhelp) |
85 | # added Fast-Search-Engine http://www.fast-search-engine.com/ [matthys70 users.sourceforge.net] |
86 | # added g2Crawler (nobody@airmail.net) http://crawler.instantnetworks.net/ |
87 | # added Jakarta commons-httpclient http://jakarta.apache.org/commons/httpclient/ (hit robots.txt). May be used as robot or browser - a site may want to remove this entry. |
88 | # added OmniExplorer_Bot http://www.omni-explorer.com/ [matthys70 users.sourceforge.net] |
89 | # added USTC-Semantic-Group ai.ustc.edu.cn/mas/en/research/index.php ? |
90 | # 2005-12-22 |
91 | # added EARTHCOM.info www.earthcom.info |
92 | # added HTTrack off-line browser 'httrack','HTTrack', http://www.httrack.com/ [Moizes Gabor] |
93 | # added KummHttp http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_g_l_301105_2\b [Moizes Gabor] |
94 | # 2006-01-01 |
95 | # added Dulance http://www.dulance.com/bot.jsp |
96 | # added MojeekBot http://www.mojeek.com/bot.html |
97 | # added nicebot http://www.egghelp.org/setup.htm ? |
98 | # added Snappy http://www.urltrends.com/faq.php |
99 | # added sohu agent |
100 | # added VORTEX http://marty.anstey.ca/robots/vortex/ [matthys70 users.sourceforge.net] |
101 | # added zspider http://feedback.redkolibri.com/ |
102 | # 2006-01-13 |
103 | # added boitho.com-dc http://www.boitho.com/dcbot.html |
104 | # added IRLbot http://irl.cs.tamu.edu/crawler |
105 | # added virus_detector virus_harvester@securecomputing.com |
106 | # added Wavefire http://www.wavefire.com; info@wavefire.com |
107 | # added WebFilter Robot |
108 | # 2006-01-24 |
109 | # added Shim-Crawler http://www.logos.ic.i.u-tokyo.ac.jp/crawler/; crawl@logos.ic.i.u-tokyo.ac.jp |
110 | # added Exabot exabot.com |
111 | # added LetsCrawl.com http://letscrawl.com |
112 | # added ichiro http://help.goo.ne.jp/door/crawlerE.html |
113 | # 2006-01-27 additional 22 robots from a list provided by Moizes Gabor |
114 | # added ALeadSoftbot http://www.aleadsoft.com/bot.htm |
115 | # added CipinetBot http://www.cipinet.com/bot.html |
116 | # added Cuasarbot http://www.cuasar.com/ |
117 | # added Dumbot http://www.dumbfind.com/ |
118 | # added Extreme_Picture_Finder http://www.exisoftware.com/ |
119 | # added Fooky.com/ScorpionBot/ScoutOut http://www.fooky.com/scorpionbots |
120 | # added IlTrovatore-Setaccio http://www.iltrovatore.it/aiuto/motore_di_ricerca.html bot@iltrovatore.it |
121 | # added InsurancoBot http://www.fastspywareremoval.com/ |
122 | # added InternetArchive http://lucene.apache.org/nutch/bot.html nutch-agent@lucene.apache.org |
123 | # added KazoomBot http://www.kazoom.ca/bot.html kazoombot@kazoom.ca |
124 | # added Kurzor http://www.easymail.hu/ cursor@easymail.hu |
125 | # added NutchCVS http://lucene.apache.org/nutch/bot.html nutch-agent@lucene.apache.org |
126 | # added NutchOSU-VLIB http://lucene.apache.org/nutch/bot.html nutch-agent@lucene.apache.org |
127 | # added Orbiter http://www.dailyorbit.com/bot.htm |
128 | # added PHP_version_tracker http://www.nexen.net/phpversion/bot.php |
129 | # added SuperBot http://www.sparkleware.com/superbot/ |
130 | # added SynooBot http://www.synoo.de/bot.html webmaster@synoo.com |
131 | # added TestBot http://www.agbrain.com/ |
132 | # added TutorGigBot http://www.tutorgig.info/ |
133 | # added WebIndexer mailto://webindexerv1@yahoo.com |
134 | # added WebMiner http://64.124.122.252/feedback.html |
135 | # 2006-02-01 |
136 | # added heritrix https://sourceforge.net/forum/message.php?msg_id=3550202 |
137 | # added Zeus Webster Pro https://sourceforge.net/forum/message.php?msg_id=3141164 |
138 | # additional robots from a list provided by Moizes Gabor [ mojzi -a-t- free mail hu ] |
139 | # added Candlelight_Favorites_Inspector |
140 | # added DomainChecker |
141 | # added EasyDL |
142 | # added FavOrg |
143 | # added Favorites_Sweeper |
144 | # added Html_Link_Validator |
145 | # added Internet_Ninja |
146 | # added JRTwine_Software_Check_Favorites_Utility |
147 | # fixed Microsoft_URL_Control |
148 | # added miniRank |
149 | # added Missigua_Locator |
150 | # added NPBot |
151 | # added Ocelli |
152 | # added Onet.pl_SA |
153 | # added proodleBot |
154 | # added SearchGuild_DMOZ_Experiment |
155 | # added Susie |
156 | # added Website_Monitoring_Bot |
157 | # added Xenu_Link_Sleuth |
158 | # 2006-05-15 |
159 | # added ASPseek http://www.aspseek.org/ |
160 | # added AdamM Bot http://home.blic.net/adamm/ |
161 | # added archive.org_bot http://crawls.archive.org/collections/bncf/crawl.html |
162 | # added arianna.libero.it (Italian Portal/search engine) |
163 | # added Biz360 spider http://www.biz360.com |
164 | # added BlogBridge Service http://www.blogbridge.com/ |
165 | # added BlogSearch http://www.icerocket.com/ |
166 | # added libcrawl |
167 | # added edgeio-retriever http://www.edgeio.com |
168 | # added FeedFlow http://feedflow.com/about |
169 | # added Biblioteca Nazionale Centrale di Firenze (Italian National Archive) http://www.bncf.firenze.sbn.it/raccolta.txt |
170 | # added Java catchall - used by many spam bots |
171 | # added lanshanbot http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=%5Cbid_g_l_140406_1%5Cb |
172 | # added msnbot-media http://search.msn.com/msnbot.htm |
173 | # added MT::Telegraph::Agent |
174 | # added Netluchs http://www.netluchs.de/ (German SE bot) |
175 | # added oBot http://www.webmasterworld.com/forum11/1616.htm |
176 | # added Onfolio http://www.onfolio.com/ (IE Toolbar plugin) - hit rss feeds. |
177 | # added ping.blo.gs http://blo.gs/ping.php blog bot |
178 | # added Sphere Scout http://www.sphere.com/ |
179 | # added sproose crawler http://www.sproose.com/bot.html |
180 | # added SyndicAPI http://syndicapi.com/bot.html |
181 | # added Yahoo! Mindset http://mindset.research.yahoo.com/ |
182 | # added msrabot |
183 | # added Vagabondo & Vagabondo-WAP http://www.wise-guys.nl/Contact/index.php?botselected=webagents&lang=uk |
184 | # fixed Missigua Locator detection (Missigua_Locator -> Missigua Locator) |
185 | # changed echo to echo! to avoid conflict with the bonecho (Firefox 2.0) browser. |
186 | # This requires you to reprocess historic logs if you want EchO! to be recognized for older reports. |
187 | # 2006-05-17 |
188 | # added Alpha Search Agent # 62.152.125.60 Eurologon Srl |
189 | # added Krugle http://www.krugle.com/crawler/info.html the search engine for developers |
190 | # added Octora Beta Bot http://www.octora.com/ # Blog and Rss Search Engine |
191 | # added UbiCrawler http://law.dsi.unimi.it/ubicrawler/ |
192 | # added Yahoo! Slurp China http://misc.yahoo.com.cn/help.html |
193 | # You must reprocess old logs for the Yahoo! Slurp China bot to be detected in old reports |
194 | # 2006-05-20 |
195 | # added 1-More Scanner http://www.myzips.com/software/1-More-Scanner.phtml |
196 | # added Accoona-AI-Agent http://www.accoona.com/ |
197 | # added ActiveBookmark http://www.libmaster.com/active_bookmark.php |
198 | # added BIGLOTRON http://www.biglotron.com/robot.html |
199 | # added Bookmark-Manager http://bkm.sourceforge.net/ |
200 | # added cbn00glebot |
201 | # added Cerberian Drtrs http://www.pgts.com.au/cgi-bin/psql?robot_info=25240 |
202 | # added CFNetwork http://www.cocoadev.com/index.pl?CFNetwork |
203 | # added CheckWeb link validator http://p.duby.free.fr/chkweb.htm |
204 | # added Computer and Automation Research Institute Crawler http://www.ilab.sztaki.hu/~stamas/publications/p184-benczur.html |
205 | # added ConveraCrawler http://www.authoritativeweb.com/crawl/ |
206 | # added ConveraMultiMediaCrawler http://www.authoritativeweb.com/crawl/ |
207 | # added CSE HTML Validator Lite Online http://online.htmlvalidator.com/php/onlinevallite.php |
208 | # added Cursor http://adcenter.hu/docs/en/bot.html |
209 | # added Custo http://www.netwu.com/custo/ |
210 | # added DataFountains/DMOZ Downloader http://infomine.ucr.edu/ |
211 | # added Deepindex http://www.deepindex.net/faq.php |
212 | # added DNSGroup http://www.dnsgroup.com/ |
213 | # added DoCoMo http://www.nttdocomo.co.jp/ |
214 | # added dumm.de-Bot http://www.dumm.de/ |
215 | # added ETS v http://www.freetranslation.com/help/ |
216 | # added eventax http://www.eventax.de/ |
217 | # added FAST Enterprise Crawler * crawleradmin.t-info@telekom.de http://www.telekom.de/ |
218 | # added FAST Enterprise Crawler http://www.fast.no/ |
219 | # added FAST Enterprise Crawler * T-Info_BI_cluster crawleradmin.t-info@telekom.de http://www.telekom.de/ |
220 | # added FeedValidator http://feedvalidator.org/ |
221 | # added FilmkameraBot http://www.filmkamera.at/bot.html |
222 | # added Findexa Crawler http://www.findexa.no/gulesider/article26548.ece |
223 | # added Global Fetch http://www.wesonet.com/ |
224 | # added GOFORITBOT http://www.goforit.com/about/ |
225 | # added GoForIt.com http://www.goforit.com/about/ |
226 | # added GPU p2p crawler http://gpu.sourceforge.net/search_engine.php |
227 | # added HooWWWer http://cosco.hiit.fi/search/hoowwwer/ |
228 | # added HPPrint |
229 | # added HTMLParser http://htmlparser.sourceforge.net/ |
230 | # added Hundesuche.com-Bot http://www.hundesuche.com/ |
231 | # added InfoBot http://www.infobot.org/ |
232 | # added InfociousBot http://corp.infocious.com/tech_crawler.php |
233 | # added InternetSupervision http://internetsupervision.com/ |
234 | # added isearch2006 http://www.yahoo.com.cn/ |
235 | # added IUPUI_Research_Bot http://spamhuntress.com/2005/04/25/a-mail-harvester-visits/ |
236 | # added KalamBot http://64.124.122.251/feedback.html |
237 | # added kamano.de NewsFeedVerzeichnis http://www.kamano.de/ |
238 | # added Kevin http://dznet.com/kevin/ |
239 | # added KnowItAll http://www.cs.washington.edu/research/knowitall/ |
240 | # added Knowledge.com http://www.knowledge.com/ |
241 | # added Kouaa Krawler http://www.kouaa.com/ |
242 | # added ksibot http://ego.ms.mff.cuni.cz/ |
243 | # added Link Valet Online http://www.htmlhelp.com/tools/valet/ |
244 | # added lwp-request http://search.cpan.org/~gaas/libwww-perl-5.69/bin/lwp-request |
245 | # added lwp-trivial http://search.cpan.org/src/GAAS/libwww-perl-5.805/lib/LWP/Simple.pm |
246 | # added MapoftheInternet.com http://MapoftheInternet.com/ |
247 | # added Matrix S.p.A. - FAST Enterprise Crawler http://tin.virgilio.it/ |
248 | # added Megite http://www.megite.com/ |
249 | # added Metaspinner http://index.meta-spinner.de/ |
250 | # added Mini-reptile |
251 | # added Misterbot http://www.misterbot.fr/ |
252 | # added Miva http://www.miva.com/ |
253 | # added Mizzu Labs http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_m_141105_2\b |
254 | # added MSRBOT http://research.microsoft.com/research/sv/msrbot/ |
255 | # added MS SharePoint Portal Server - MS Search 4.0 Robot http://support.microsoft.com/default.aspx?scid=kb;en-us;284022 |
256 | # added Mydoyouhike http://www.doyouhike.net/my |
257 | # added NASA Search http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_n_s_140506_2\b |
258 | # added NetSprint http://www.netsprint.pl/serwis/ |
259 | # added NimbleCrawler http://www.healthline.com/ |
260 | # added OpenWebSpider http://www.openwebspider.org/ |
261 | # added Oracle Ultra Search http://www.oracle.com/technology/products/ultrasearch/index.html |
262 | # added OSSProxy http://www.marketscore.com/FAQ.Aspx |
263 | # added passwordmaker.org http://passwordmaker.org/ |
264 | # added PEAR HTTP Request class http://pear.php.net/ |
265 | # added PEERbot http://www.peerbot.com/ |
266 | # added PHP version tracker http://www.nexen.net/phpversion/bot.php |
267 | # added PictureOfInternet http://malfunction.org/poi/ |
268 | # added plinki http://www.plinki.com/ |
269 | # added Port Huron Labs http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_n_s_1133\b |
270 | # added PostFavorites http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_n_s_1135\b |
271 | # added ProjectWF-java-test-crawler |
272 | # added PyQuery http://sourceforge.net/projects/pyquery/ |
273 | # added Schizozilla http://spamhuntress.com/2005/03/18/gizmo/ |
274 | # added Scumbot |
275 | # added Sensis Web Crawler http://www.sensis.com.au/ |
276 | # added snap.com beta crawler http://www.snap.com/ |
277 | # added Steeler http://www.tkl.iis.u-tokyo.ac.jp/~crawler/ |
278 | # added STEROID Download http://faqs.org.ru/progr/pascal/delphi_internet2.htm |
279 | # added Suchfin-Bot http://www.suchfin.de/ |
280 | # added Sunrise http://www.sunrisexp.com/ |
281 | # added Tagyu Agent http://www.tagyu.com/ |
282 | # added Tcl http client package http://www.tcl.tk/man/tcl8.4/TclCmd/http.htm |
283 | # added TeragramCrawlerSURF http://www.teragram.com/ |
284 | # added Test Crawler http://netp.ath.cx/ |
285 | # added UnChaos Bot Hybrid Web Search Engine http://www.unchaos.com/ |
286 | # added unido-bot http://www.unchina.org/unido/unido/our_projects/3_3.html |
287 | # added UniversalFeedParser http://feedparser.org/ (seen from md301000.inktomisearch.com) |
288 | # added updated http://www.updated.com/ |
289 | # added Vermut http://vermut.aol.com |
290 | # added versus crawler from eda.baykan@epfl.ch http://www.epfl.ch/Eindex.html |
291 | # added Vespa Crawler (Yahoo Norway?) http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=%5Cbid_t_z_030406_1%5Cb |
292 | # added VSE http://www.vivisimo.com/ |
293 | # added webcrawl.net http://www.webcrawl.net/ |
294 | # added Web Downloader http://www.krasu.ru/soft/chuchelo/ |
295 | # added Webdup http://www.webdup.com/en/index.html |
296 | # added Wells Search http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_t_z_1484\b |
297 | # added WordPress http://wordpress.org/ |
298 | # added wume crawler http://wume.cse.lehigh.edu/~xiq204/crawler/ |
299 | # added Xenu's Link Sleuth (with ') |
300 | # added xirq http://www.xirq.com/ |
301 | # added yoogliFetchAgent http://www.yoogli.com/ |
302 | # added Z-Add Link Checker http://w3.z-add.co.uk/linkcheck/ |
303 | # -- fix - some robots were reported with _ where _ should have been a space. |
304 | # changed Xenu Link Sleuth |
305 | # changed microsoft[_+ ]url[_+ ]control -> microsoft_url_control |
306 | # changed favorites_sweeper -> favorites_sweeper |
307 | # -- updates |
308 | # updated AskJeeves to Ask |
309 | |
310 | # to do MS Search 4.0 Robot |
311 | |
312 | #package AWSROB; |
313 | |
314 | |
315 | # Robots list was found at http://www.robotstxt.org/wc/active/all.txt |
316 | # Other robots can be found at http://www.jafsoft.com/searchengines/webbots.html |
317 | # Rem: To avoid bad detection, some robot's ids were removed from this list: |
318 | # - Robots with ID of 3 letters only |
319 | # - Robots called 'webs' and 'tcl' |
320 | # Rem: directhit changed into direct_hit (its real id) |
321 | # Rem: calif changed into calif[^r] to avoid confusion with the Tiscalifreenet browser |
322 | # Rem: fish changed into [^a]fish to avoid confusion with the Madsafish browser |
323 | # Rem: roadrunner changed into road_runner |
324 | # Rem: lycos changed to lycos_ to avoid confusion with lycos-online browser |
325 | # Rem: voyager changed into ^voyager\/ to avoid excluding the voyager and amigavoyager browsers |
326 | |
327 | # RobotsSearchIDOrder |
328 | # It contains all matching criteria to search for in log fields. This list is |
329 | # used to know in which order to search Robot IDs. |
330 | # Most frequent ones are in list1, used when LevelForRobotsDetection is 1 or more |
331 | # Minor robots are in list2, used when LevelForRobotsDetection is 2 or more |
332 | # Note: Robot IDs are in lower case; '_', ' ' and '+' are changed into '[_+ ]' and are quoted. |
333 | #------------------------------------------------------- |
334 | @RobotsSearchIDOrder_list1 = ( # Most frequent robots, searched in this order (used when LevelForRobotsDetection is 1 or more) |
335 | # Common robots (In robot file) |
336 | 'appie', |
337 | 'architext', |
338 | 'jeeves', |
339 | 'bjaaland', |
340 | 'contentmatch', |
341 | 'ferret', |
342 | 'googlebot', |
343 | 'google\-sitemaps', |
344 | 'gulliver', |
345 | 'virus[_+ ]detector', # Must be before harvest (presumably because its contact address, virus_harvester@..., contains 'harvest') |
346 | 'harvest', |
347 | 'htdig', |
348 | 'linkwalker', |
349 | 'lilina', |
350 | 'lycos[_+ ]', # Trailing separator avoids confusion with the lycos-online browser |
351 | 'moget', |
352 | 'muscatferret', |
353 | 'myweb', |
354 | 'nomad', |
355 | 'scooter', |
356 | 'slurp', |
357 | '^voyager\/', # Anchored form, so the voyager and amigavoyager browsers are not excluded |
358 | 'weblayers', |
359 | # Common robots (Not in robot file) |
360 | 'antibot', |
361 | 'bruinbot', |
362 | 'digout4u', |
363 | 'echo!', # 'echo!' rather than 'echo' to avoid a conflict with the bonecho (Firefox 2.0) browser |
364 | 'fast\-webcrawler', |
365 | 'ia_archiver\-web\.archive\.org', # Must be before ia_archiver to avoid confusion with alexa |
366 | 'ia_archiver', |
367 | 'jennybot', |
368 | 'mercator', |
369 | 'netcraft', |
370 | 'msnbot\-media', # Listed before msnbot so the more specific ID is tried first |
371 | 'msnbot', |
372 | 'petersnews', |
373 | 'relevantnoise\.com', |
374 | 'unlost_web_crawler', |
375 | 'voila', |
376 | 'webbase', |
377 | 'webcollage', |
378 | 'cfetch', |
379 | 'zyborg', # Must be before wisenut: the ZyBorg user agent contains www.WISEnutbot.com |
380 | 'wisenutbot' |
381 | ); |
382 | @RobotsSearchIDOrder_list2 = ( |
383 | # Less common robots (In robot file) |
384 | '[^a]fish', |
385 | 'abcdatos', |
386 | 'acme\.spider', |
387 | 'ahoythehomepagefinder', |
388 | 'alkaline', |
389 | 'anthill', |
390 | 'arachnophilia', |
391 | 'arale', |
392 | 'araneo', |
393 | 'aretha', |
394 | 'ariadne', |
395 | 'powermarks', |
396 | 'arks', |
397 | 'aspider', |
398 | 'atn\.txt', |
399 | 'atomz', |
400 | 'auresys', |
401 | 'backrub', |
402 | 'bbot', |
403 | 'bigbrother', |
404 | 'blackwidow', |
405 | 'blindekuh', |
406 | 'bloodhound', |
407 | 'borg\-bot', |
408 | 'brightnet', |
409 | 'bspider', |
410 | 'cactvschemistryspider', |
411 | 'calif[^r]', |
412 | 'cassandra', |
413 | 'cgireader', |
414 | 'checkbot', |
415 | 'christcrawler', |
416 | 'churl', |
417 | 'cienciaficcion', |
418 | 'collective', |
419 | 'combine', |
420 | 'conceptbot', |
421 | 'coolbot', |
422 | 'core', |
423 | 'cosmos', |
424 | 'cruiser', |
425 | 'cusco', |
426 | 'cyberspyder', |
427 | 'desertrealm', |
428 | 'deweb', |
429 | 'dienstspider', |
430 | 'digger', |
431 | 'diibot', |
432 | 'direct_hit', |
433 | 'dnabot', |
434 | 'download_express', |
435 | 'dragonbot', |
436 | 'dwcp', |
437 | 'e\-collector', |
438 | 'ebiness', |
439 | 'elfinbot', |
440 | 'emacs', |
441 | 'emcspider', |
442 | 'esther', |
443 | 'evliyacelebi', |
444 | 'fastcrawler', |
445 | 'feedcrawl', |
446 | 'fdse', |
447 | 'felix', |
448 | 'fetchrover', |
449 | 'fido', |
450 | 'finnish', |
451 | 'fireball', |
452 | 'fouineur', |
453 | 'francoroute', |
454 | 'freecrawl', |
455 | 'funnelweb', |
456 | 'gama', |
457 | 'gazz', |
458 | 'gcreep', |
459 | 'getbot', |
460 | 'geturl', |
461 | 'golem', |
462 | 'gougou', |
463 | 'grapnel', |
464 | 'griffon', |
465 | 'gromit', |
466 | 'gulperbot', |
467 | 'hambot', |
468 | 'havindex', |
469 | 'hometown', |
470 | 'htmlgobble', |
471 | 'hyperdecontextualizer', |
472 | 'iajabot', |
473 | 'iaskspider', |
474 | 'hl_ftien_spider', |
475 | 'sogou', |
476 | 'iconoclast', |
477 | 'ilse', |
478 | 'imagelock', |
479 | 'incywincy', |
480 | 'informant', |
481 | 'infoseek', |
482 | 'infoseeksidewinder', |
483 | 'infospider', |
484 | 'inspectorwww', |
485 | 'intelliagent', |
486 | 'irobot', |
487 | 'iron33', |
488 | 'israelisearch', |
489 | 'javabee', |
490 | 'jbot', |
491 | 'jcrawler', |
492 | 'jobo', |
493 | 'jobot', |
494 | 'joebot', |
495 | 'jubii', |
496 | 'jumpstation', |
497 | 'kapsi', |
498 | 'katipo', |
499 | 'kilroy', |
500 | 'ko[_+ ]yappo[_+ ]robot', |
501 | 'kummhttp', |
502 | 'labelgrabber\.txt', |
503 | 'larbin', |
504 | 'legs', |
505 | 'linkidator', |
506 | 'linkscan', |
507 | 'lockon', |
508 | 'logo_gif', |
509 | 'macworm', |
510 | 'magpie', |
511 | 'marvin', |
512 | 'mattie', |
513 | 'mediafox', |
514 | 'merzscope', |
515 | 'meshexplorer', |
516 | 'mindcrawler', |
517 | 'mnogosearch', |
518 | 'momspider', |
519 | 'monster', |
520 | 'motor', |
521 | 'muncher', |
522 | 'mwdsearch', |
523 | 'ndspider', |
524 | 'nederland\.zoek', |
525 | 'netcarta', |
526 | 'netmechanic', |
527 | 'netscoop', |
528 | 'newscan\-online', |
529 | 'nhse', |
530 | 'northstar', |
531 | 'nzexplorer', |
532 | 'objectssearch', |
533 | 'occam', |
534 | 'octopus', |
535 | 'openfind', |
536 | 'orb_search', |
537 | 'packrat', |
538 | 'pageboy', |
539 | 'parasite', |
540 | 'patric', |
541 | 'pegasus', |
542 | 'perignator', |
543 | 'perlcrawler', |
544 | 'phantom', |
545 | 'phpdig', |
546 | 'piltdownman', |
547 | 'pimptrain', |
548 | 'pioneer', |
549 | 'pitkow', |
550 | 'pjspider', |
551 | 'plumtreewebaccessor', |
552 | 'poppi', |
553 | 'portalb', |
554 | 'psbot', |
555 | 'python', |
556 | 'raven', |
557 | 'rbse', |
558 | 'resumerobot', |
559 | 'rhcs', |
560 | 'road_runner', |
561 | 'robbie', |
562 | 'robi', |
563 | 'robocrawl', |
564 | 'robofox', |
565 | 'robozilla', |
566 | 'roverbot', |
567 | 'rules', |
568 | 'safetynetrobot', |
569 | 'search\-info', |
570 | 'search_au', |
571 | 'searchprocess', |
572 | 'senrigan', |
573 | 'sgscout', |
574 | 'shaggy', |
575 | 'shaihulud', |
576 | 'sift', |
577 | 'simbot', |
578 | 'site\-valet', |
579 | 'sitetech', |
580 | 'skymob', |
581 | 'slcrawler', |
582 | 'smartspider', |
583 | 'snooper', |
584 | 'solbot', |
585 | 'speedy', |
586 | 'spider[_+ ]monkey', |
587 | 'spiderbot', |
588 | 'spiderline', |
589 | 'spiderman', |
590 | 'spiderview', |
591 | 'spry', |
592 | 'sqworm', |
593 | 'ssearcher', |
594 | 'suke', |
595 | 'sunrise', |
596 | 'suntek', |
597 | 'sven', |
598 | 'tach_bw', |
599 | 'tagyu_agent', |
600 | 'tailrank', |
601 | 'tarantula', |
602 | 'tarspider', |
603 | 'techbot', |
604 | 'templeton', |
605 | 'titan', |
606 | 'titin', |
607 | 'tkwww', |
608 | 'tlspider', |
609 | 'ucsd', |
610 | 'udmsearch', |
611 | 'universalfeedparser', |
612 | 'urlck', |
613 | 'valkyrie', |
614 | 'verticrawl', |
615 | 'victoria', |
616 | 'visionsearch', |
617 | 'voidbot', |
618 | 'vwbot', |
619 | 'w3index', |
620 | 'w3m2', |
621 | 'wallpaper', |
622 | 'wanderer', |
623 | 'wapspIRLider', |
624 | 'webbandit', |
625 | 'webcatcher', |
626 | 'webcopy', |
627 | 'webfetcher', |
628 | 'webfoot', |
629 | 'webinator', |
630 | 'weblinker', |
631 | 'webmirror', |
632 | 'webmoose', |
633 | 'webquest', |
634 | 'webreader', |
635 | 'webreaper', |
636 | 'websnarf', |
637 | 'webspider', |
638 | 'webvac', |
639 | 'webwalk', |
640 | 'webwalker', |
641 | 'webwatch', |
642 | 'whatuseek', |
643 | 'whowhere', |
644 | 'wired\-digital', |
645 | 'wmir', |
646 | 'wolp', |
647 | 'wombat', |
648 | 'wordpress', |
649 | 'worm', |
650 | 'woozweb', |
651 | 'wwwc', |
652 | 'wz101', |
653 | 'xget', |
654 | # Other robots reported by users |
655 | '1\-more_scanner', |
656 | 'accoona\-ai\-agent', |
657 | 'activebookmark', |
658 | 'adamm_bot', |
659 | 'almaden', |
660 | 'aipbot', |
661 | 'aleadsoftbot', |
662 | 'alpha_search_agent', |
663 | 'allrati', |
664 | 'aport', |
665 | 'archive\.org_bot', |
666 | 'argus', # Must be before nutch |
667 | 'arianna\.libero\.it', |
668 | 'aspseek', |
669 | 'asterias', |
670 | 'awbot', |
671 | 'baiduspider', |
672 | 'becomebot', |
673 | 'bender', |
674 | 'betabot', |
675 | 'biglotron', |
676 | 'bittorrent_bot', |
677 | 'biz360[_+ ]spider', |
678 | 'blogbridge[_+ ]service', |
679 | 'bloglines', |
680 | 'blogpulse', |
681 | 'blogsearch', |
682 | 'blogshares', |
683 | 'blogslive', |
684 | 'blogssay', |
685 | 'bncf\.firenze\.sbn\.it\/raccolta\.txt', |
686 | 'bobby', |
687 | 'boitho\.com\-dc', |
688 | 'bookmark\-manager', |
689 | 'boris', |
690 | 'bumblebee', |
691 | 'candlelight[_+ ]favorites[_+ ]inspector', |
692 | 'cbn00glebot', |
693 | 'cerberian_drtrs', |
694 | 'cfnetwork', |
695 | 'cipinetbot', |
696 | 'checkweb_link_validator', |
697 | 'commons\-httpclient', |
698 | 'computer_and_automation_research_institute_crawler', |
699 | 'converamultimediacrawler', |
700 | 'converacrawler', |
701 | 'cscrawler', |
702 | 'cse_html_validator_lite_online', |
703 | 'cuasarbot', |
704 | 'cursor', |
705 | 'custo', |
706 | 'datafountains\/dmoz_downloader', |
707 | 'daviesbot', |
708 | 'daypopbot', |
709 | 'deepindex', |
710 | 'dipsie\.bot', |
711 | 'dnsgroup', |
712 | 'domainchecker', |
713 | 'domainsdb\.net', |
714 | 'dulance', |
715 | 'dumbot', |
716 | 'dumm\.de\-bot', |
717 | 'earthcom\.info', |
718 | 'easydl', |
719 | 'edgeio\-retriever', |
720 | 'ets_v', |
721 | 'exactseek', |
722 | 'extreme[_+ ]picture[_+ ]finder', |
723 | 'eventax', |
724 | 'everbeecrawler', |
725 | 'everest\-vulcan', |
726 | 'ezresult', |
727 | 'enteprise', |
728 | 'facebook', |
729 | 'fast_enterprise_crawler.*crawleradmin\.t\-info@telekom\.de', |
730 | 'fast_enterprise_crawler.*t\-info_bi_cluster_crawleradmin\.t\-info@telekom\.de', |
731 | 'matrix_s\.p\.a\._\-_fast_enterprise_crawler', # must come before fast enterprise crawler |
732 | 'fast_enterprise_crawler', |
733 | 'fast\-search\-engine', |
734 | 'favicon', |
735 | 'favorg', |
736 | 'favorites_sweeper', |
737 | 'feedburner', |
738 | 'feedfetcher\-google', |
739 | 'feedflow', |
740 | 'feedster', |
741 | 'feedsky', |
742 | 'feedvalidator', |
743 | 'filmkamerabot', |
744 | 'findlinks', |
745 | 'findexa_crawler', |
746 | 'fooky\.com\/ScorpionBot', |
747 | 'g2crawler', |
748 | 'gaisbot', |
749 | 'geniebot', |
750 | 'gigabot', |
751 | 'girafabot', |
752 | 'global_fetch', |
753 | 'gnodspider', |
754 | 'goforit\.com', |
755 | 'goforitbot', |
756 | 'gonzo', |
757 | 'grub', |
758 | 'gpu_p2p_crawler', |
759 | 'henrythemiragorobot', |
760 | 'heritrix', |
761 | 'holmes', |
762 | 'hoowwwer', |
763 | 'hpprint', |
764 | 'htmlparser', |
765 | 'html[_+ ]link[_+ ]validator', |
766 | 'httrack', |
767 | 'hundesuche\.com\-bot', |
768 | 'ichiro', |
769 | 'iltrovatore\-setaccio', |
770 | 'infobot', |
771 | 'infociousbot', |
772 | 'infomine', |
773 | 'insurancobot', |
774 | 'internet[_+ ]ninja', |
775 | 'internetarchive', |
776 | 'internetseer', |
777 | 'internetsupervision', |
778 | 'irlbot', |
779 | 'isearch2006', |
780 | 'iupui_research_bot', |
781 | 'jrtwine[_+ ]software[_+ ]check[_+ ]favorites[_+ ]utility', |
782 | 'justview', |
783 | 'kalambot', |
784 | 'kamano\.de_newsfeedverzeichnis', |
785 | 'kazoombot', |
786 | 'kevin', |
787 | 'keyoshid', # Must come before Y!J |
788 | 'kinjabot', |
789 | 'kinja\-imagebot', |
790 | 'knowitall', |
791 | 'knowledge\.com', |
792 | 'kouaa_krawler', |
793 | 'krugle', |
794 | 'ksibot', |
795 | 'kurzor', |
796 | 'lanshanbot', |
797 | 'letscrawl\.com', |
798 | 'libcrawl', |
799 | 'linkbot', |
800 | 'link_valet_online', |
801 | 'metager\-linkchecker', # Must be before linkchecker |
802 | 'linkchecker', |
803 | 'livejournal\.com', |
804 | 'lmspider', |
805 | 'lwp\-request', |
806 | 'lwp\-trivial', |
807 | 'magpierss', |
808 | 'mail\.ru', |
809 | 'mapoftheinternet\.com', |
810 | 'mediapartners\-google', |
811 | 'megite', |
812 | 'metaspinner', |
813 | 'microsoft[_+ ]url[_+ ]control', |
814 | 'mini\-reptile', |
815 | 'minirank', |
816 | 'missigua_locator', |
817 | 'misterbot', |
818 | 'miva', |
819 | 'mizzu_labs', |
820 | 'mj12bot', |
821 | 'mojeekbot', |
822 | 'msiecrawler', |
823 | 'ms_search_4\.0_robot', |
824 | 'msrabot', |
825 | 'msrbot', |
826 | 'mt::telegraph::agent', |
827 | 'nagios', |
828 | 'nasa_search', |
829 | 'mydoyouhike', |
830 | 'netluchs', |
831 | 'netsprint', |
832 | 'newsgatoronline', |
833 | 'nicebot', |
834 | 'nimblecrawler', |
835 | 'noxtrumbot', |
836 | 'npbot', |
837 | 'nutchcvs', |
838 | 'nutchosu\-vlib', |
839 | 'nutch', # Must come after other nutch versions |
840 | 'ocelli', |
841 | 'octora_beta_bot', |
842 | 'omniexplorer[_+ ]bot', |
843 | 'onet\.pl[_+ ]sa', |
844 | 'onfolio', |
845 | 'opentaggerbot', |
846 | 'openwebspider', |
847 | 'oracle_ultra_search', |
848 | 'orbiter', |
849 | 'yodaobot', |
850 | 'qihoobot', |
851 | 'passwordmaker\.org', |
852 | 'pear_http_request_class', |
853 | 'peerbot', |
854 | 'perman', |
855 | 'php[_+ ]version[_+ ]tracker', |
856 | 'pictureofinternet', |
857 | 'ping\.blo\.gs', |
858 | 'plinki', |
859 | 'pluckfeedcrawler', |
860 | 'pogodak', |
861 | 'pompos', |
862 | 'popdexter', |
863 | 'port_huron_labs', |
864 | 'postfavorites', |
865 | 'projectwf\-java\-test\-crawler', |
866 | 'proodlebot', |
867 | 'pyquery', |
868 | 'rambler', |
869 | 'redalert', |
870 | 'rojo', |
871 | 'rssimagesbot', |
872 | 'ruffle', |
873 | 'rufusbot', |
874 | 'sandcrawler', |
875 | 'sbider', |
876 | 'schizozilla', |
877 | 'scumbot', |
878 | 'searchguild[_+ ]dmoz[_+ ]experiment', |
879 | 'seekbot', |
880 | 'sensis_web_crawler', |
881 | 'seznambot', |
882 | 'shim\-crawler', |
883 | 'shoutcast', |
884 | 'slysearch', |
885 | 'snap\.com_beta_crawler', |
886 | 'sohu\-search', |
887 | 'sohu', # "sohu agent" |
888 | 'snappy', |
889 | 'sphere_scout', |
890 | 'spip', |
891 | 'sproose_crawler', |
892 | 'steeler', |
893 | 'steroid__download', |
894 | 'suchfin\-bot', |
895 | 'superbot', |
896 | 'surveybot', |
897 | 'susie', |
898 | 'syndic8', |
899 | 'syndicapi', |
900 | 'synoobot', |
901 | 'tcl_http_client_package', |
902 | 'technoratibot', |
903 | 'teragramcrawlersurf', |
904 | 'test_crawler', |
905 | 'testbot', |
906 | 't\-h\-u\-n\-d\-e\-r\-s\-t\-o\-n\-e', |
907 | 'topicblogs', |
908 | 'turnitinbot', |
909 | 'turtlescanner', # Must be before turtle |
910 | 'turtle', |
911 | 'tutorgigbot', |
912 | 'twiceler', |
913 | 'ubicrawler', |
914 | 'ultraseek', |
915 | 'unchaos_bot_hybrid_web_search_engine', |
916 | 'unido\-bot', |
917 | 'updated', |
918 | 'ustc\-semantic\-group', |
919 | 'vagabondo\-wap', |
920 | 'vagabondo', |
921 | 'vermut', |
922 | 'versus_crawler_from_eda\.baykan@epfl\.ch', |
923 | 'vespa_crawler', |
924 | 'vortex', |
925 | 'vse\/', |
926 | 'w3c\-checklink', |
927 | 'w3c[_+ ]css[_+ ]validator[_+ ]jfouffa', |
928 | 'w3c_validator', |
929 | 'watchmouse', |
930 | 'wavefire', |
931 | 'webclipping\.com', |
932 | 'webcompass', |
933 | 'webcrawl\.net', |
934 | 'web_downloader', |
935 | 'webdup', |
936 | 'webfilter', |
937 | 'webindexer', |
938 | 'webminer', |
939 | 'website[_+ ]monitoring[_+ ]bot', |
940 | 'webvulncrawl', |
941 | 'wells_search', |
942 | 'wonderer', |
943 | 'wume_crawler', |
944 | 'wwweasel', |
945 | 'xenu\'s_link_sleuth', |
946 | 'xenu_link_sleuth', |
947 | 'xirq', |
948 | 'y!j', # Must come after keyoshid Y!J |
949 | 'yacy', |
950 | 'yahoo\-blogs', |
951 | 'yahoo\-verticalcrawler', |
952 | 'yahoofeedseeker', |
953 | 'yahooseeker\-testing', |
954 | 'yahooseeker', |
955 | 'yahoo\-mmcrawler', |
956 | 'yahoo!_mindset', |
957 | 'yandex', |
958 | 'flexum', |
959 | 'yanga', |
960 | 'yooglifetchagent', |
961 | 'z\-add_link_checker', |
962 | 'zealbot', |
963 | 'zhuaxia', |
964 | 'zspider', |
965 | 'zeus', |
966 | 'ng\/1\.', # put at end to avoid false positive |
967 | 'ng\/2\.', # put at end to avoid false positive |
968 | 'exabot', # put at end to avoid false positive |
969 | # Other IDs that are robots 99% of the time |
970 | 'wget', |
971 | 'libwww', |
972 | 'java\/[0-9]' # put at end to avoid false positive |
973 | ); |
974 | @RobotsSearchIDOrder_listgen = ( |
975 | # Generic robot patterns: broad regex fragments (plain words plus two character-class forms) rather than IDs of specific robots. NOTE(review): presumably tried after the more specific lists above -- confirm in awstats.pl |
976 | 'robot', |
977 | 'checker', |
978 | 'crawl', |
979 | 'discovery', |
980 | 'hunter', |
981 | 'scanner', |
982 | 'spider', |
983 | 'sucker', |
984 | 'bot[\s_+:,\.\;\/\\\-]', # "bot" followed by a separator: whitespace or one of _ + : , . ; / \ - |
985 | '[\s_+:,\.\;\/\\\-]bot', # "bot" preceded by one of the same separator characters |
986 | 'no_user_agent' # NOTE(review): presumably a placeholder ID for hits that carry no User-Agent -- confirm against awstats.pl |
987 | ); |
988 | |
989 | |
990 | |
991 | # RobotsHashIDLib |
992 | # List of robot names ('robot id','robot clear text') |
993 | #------------------------------------------------------- |
994 | %RobotsHashIDLib = ( |
995 | # Common robots (In robot file) |
996 | 'appie','<a href="http://www.walhello.com/" title="Bot home page [new window]" target="_blank">Walhello appie</a>', |
997 | 'architext','ArchitextSpider', |
998 | 'jeeves','<a href="http://sp.ask.com/docs/about/tech_crawling.html" title="Bot home page [new window]" target="_blank">Ask</a>', |
999 | 'bjaaland','Bjaaland', |
1000 | 'ferret','Wild Ferret Web Hopper #1, #2, #3', |
1001 | 'contentmatch','<a href="http://p4p.cn.yahoo.com">Yahoo!China ContentMatch Crawler</a>', |
1002 | 'googlebot','<a href="http://www.google.com/bot.html" title="Bot home page [new window]" target="_blank">Googlebot</a>', |
1003 | 'google\-sitemaps', 'Google Sitemaps', |
1004 | 'gulliver','Northern Light Gulliver', |
1005 | 'virus[_+ ]detector','<a href="http://www.securecomputing.com/" title="virus_harvester@securecomputing.com; Bot home page [new window]" target="_blank">virus_detector</a>', |
1006 | 'harvest','Harvest', |
1007 | 'htdig','ht://Dig', |
1008 | 'linkwalker','LinkWalker', |
1009 | 'lilina','Lilina', |
1010 | 'lycos[_+ ]','Lycos', |
1011 | 'moget','moget', |
1012 | 'muscatferret','Muscat Ferret', |
1013 | 'myweb','Internet Shinchakubin', |
1014 | 'nomad','Nomad', |
1015 | 'scooter','Scooter', |
1016 | 'slurp','<a href="http://help.yahoo.com/help/us/ysearch/slurp/" title="Bot home page [new window]" target="_blank">Yahoo Slurp</a>', |
1017 | '^voyager\/','Voyager', |
1018 | 'weblayers','Weblayers', |
1019 | # Common robots (Not in robot file) |
1020 | 'antibot','Antibot', |
1021 | 'bruinbot','<a href="http://web.archive.org/" title="BruinBot home page [new window]" target="_blank">The web archive</a>', |
1022 | 'digout4u','Digout4u', |
1023 | 'echo!','EchO!', |
1024 | 'fast\-webcrawler','Fast-Webcrawler', |
1025 | 'ia_archiver\-web\.archive\.org','<a href="http://web.archive.org/" title="Bot home page [new window]" target="_blank">The web archive (IA Archiver)</a>', |
1026 | 'ia_archiver','<a href="http://www.alexa.com/" title="Bot home page [new window]" target="_blank">Alexa (IA Archiver)</a>', |
1027 | 'jennybot','JennyBot', |
1028 | 'mercator','Mercator', |
1029 | 'msnbot\-media','<a href="http://search.msn.com/msnbot.htm" title="Bot home page [new window]" target="_blank">MSNBot-media</a>', |
1030 | 'msnbot','<a href="http://search.msn.com/msnbot.htm" title="Bot home page [new window]" target="_blank">MSNBot</a>', |
1031 | 'netcraft','<a href="http://www.netcraft.com/survey/" title="Bot home page [new window]" target="_blank">Netcraft</a>', |
1032 | 'petersnews','Petersnews', |
1033 | 'unlost_web_crawler','Unlost Web Crawler', |
1034 | 'voila','Voila', |
1035 | 'webbase', 'WebBase', |
1036 | 'zyborg','<a href="http://www.WISEnutbot.com/" title="wn-14.zyborg@looksmart.net Bot home page [new window]" target="_blank">ZyBorg</a>', |
1037 | 'wisenutbot','<a href="http://www.WISEnutbot.com/" title="Bot home page [new window]" target="_blank">WISENutbot</a>', |
1038 | 'webcollage','<a href="http://www.jwz.org/webcollage/" title="WebCollage home page [new window]" target="_blank">WebCollage</a>', |
1039 | 'cfetch','<a href="http://www.kosmix.com/crawler.html" title="kosmix home page [new window]" target="_blank">Cfetch</a>', |
1040 | # Less common robots (In robot file) |
1041 | '[^a]fish','Fish search', |
1042 | 'abcdatos','ABCdatos BotLink', |
1043 | 'acme\.spider','Acme.Spider', |
1044 | 'ahoythehomepagefinder','Ahoy! The Homepage Finder', |
1045 | 'alkaline','Alkaline', |
1046 | 'anthill','Anthill', |
1047 | 'arachnophilia','Arachnophilia', |
1048 | 'arale','Arale', |
1049 | 'araneo','Araneo', |
1050 | 'aretha','Aretha', |
1051 | 'ariadne','ARIADNE', |
1052 | 'powermarks','<a href="http://www.kaylon.com/power.html" title="Bot home page [new window]" target="_blank">Powermarks</a>', # must come before Arks; seen used by referrer spam |
1053 | 'arks','arks', |
1054 | 'aspider','ASpider (Associative Spider)', |
1055 | 'atn\.txt','ATN Worldwide', |
1056 | 'atomz','Atomz.com Search Robot', |
1057 | 'auresys','AURESYS', |
1058 | 'backrub','BackRub', |
1059 | 'bbot','BBot', |
1060 | 'bigbrother','Big Brother', |
1061 | 'blackwidow','BlackWidow', |
1062 | 'blindekuh','Die Blinde Kuh', |
1063 | 'bloodhound','Bloodhound', |
1064 | 'borg\-bot','Borg-Bot', |
1065 | 'brightnet','bright.net caching robot', |
1066 | 'bspider','BSpider', |
1067 | 'cactvschemistryspider','CACTVS Chemistry Spider', |
1068 | 'calif[^r]','Calif', |
1069 | 'cassandra','Cassandra', |
1070 | 'cgireader','Digimarc Marcspider/CGI', |
1071 | 'checkbot','Checkbot', |
1072 | 'christcrawler','ChristCrawler.com', |
1073 | 'churl','churl', |
1074 | 'cienciaficcion','cIeNcIaFiCcIoN.nEt', |
1075 | 'collective','Collective', |
1076 | 'combine','Combine System', |
1077 | 'conceptbot','Conceptbot', |
1078 | 'coolbot','CoolBot', |
1079 | 'core','Web Core / Roots', |
1080 | 'cosmos','XYLEME Robot', |
1081 | 'cruiser','Internet Cruiser Robot', |
1082 | 'cusco','Cusco', |
1083 | 'cyberspyder','CyberSpyder Link Test', |
1084 | 'desertrealm','Desert Realm Spider', |
1085 | 'deweb','DeWeb(c) Katalog/Index', |
1086 | 'dienstspider','DienstSpider', |
1087 | 'digger','Digger', |
1088 | 'diibot','Digital Integrity Robot', |
1089 | 'direct_hit','Direct Hit Grabber', |
1090 | 'dnabot','DNAbot', |
1091 | 'download_express','DownLoad Express', |
1092 | 'dragonbot','DragonBot', |
1093 | 'dwcp','DWCP (Dridus\' Web Cataloging Project)', |
1094 | 'e\-collector','e-collector', |
1095 | 'ebiness','EbiNess', |
1096 | 'elfinbot','ELFINBOT', |
1097 | 'emacs','Emacs-w3 Search Engine', |
1098 | 'emcspider','ananzi', |
1099 | 'esther','Esther', |
1100 | 'evliyacelebi','Evliya Celebi', |
1101 | 'fastcrawler','FastCrawler', |
1102 | 'feedcrawl','FeedCrawl by feed@aobo.com', |
1103 | 'fdse','Fluid Dynamics Search Engine robot', |
1104 | 'felix','Felix IDE', |
1105 | 'fetchrover','FetchRover', |
1106 | 'fido','fido', |
1107 | 'finnish','H���ki', |
1108 | 'fireball','KIT-Fireball', |
1109 | 'fouineur','Fouineur', |
1110 | 'francoroute','Robot Francoroute', |
1111 | 'freecrawl','Freecrawl', |
1112 | 'funnelweb','FunnelWeb', |
1113 | 'gama','gammaSpider, FocusedCrawler', |
1114 | 'gazz','gazz', |
1115 | 'gcreep','GCreep', |
1116 | 'getbot','GetBot', |
1117 | 'geturl','GetURL', |
1118 | 'golem','Golem', |
1119 | 'gougou','GouGou', |
1120 | 'grapnel','Grapnel/0.01 Experiment', |
1121 | 'griffon','Griffon', |
1122 | 'gromit','Gromit', |
1123 | 'gulperbot','Gulper Bot', |
1124 | 'hambot','HamBot', |
1125 | 'havindex','havIndex', |
1126 | 'hometown','Hometown Spider Pro', |
1127 | 'htmlgobble','HTMLgobble', |
1128 | 'hyperdecontextualizer','Hyper-Decontextualizer', |
1129 | 'iajabot','iajaBot', |
1130 | 'iaskspider','<a href="http://www.iask.com/" target="_blank">Sina Iask Spider</a>', |
1131 | 'hl_ftien_spider','<a href="http://www.hylanda.com/" target="_blank">Hylanda</a>', |
1132 | 'sogou','<a href="http://www.sogou.com/" target="_blank">Sogou Spider</a>', |
1133 | 'iconoclast','Popular Iconoclast', |
1134 | 'ilse','Ingrid', |
1135 | 'imagelock','Imagelock', |
1136 | 'incywincy','IncyWincy', |
1137 | 'informant','Informant', |
1138 | 'infoseek','InfoSeek Robot 1.0', |
1139 | 'infoseeksidewinder','Infoseek Sidewinder', |
1140 | 'infospider','InfoSpiders', |
1141 | 'inspectorwww','Inspector Web', |
1142 | 'intelliagent','IntelliAgent', |
1143 | 'irobot','I, Robot', |
1144 | 'iron33','Iron33', |
1145 | 'israelisearch','Israeli-search', |
1146 | 'javabee','JavaBee', |
1147 | 'jbot','JBot Java Web Robot', |
1148 | 'jcrawler','JCrawler', |
1149 | 'jobo','JoBo Java Web Robot', |
1150 | 'jobot','Jobot', |
1151 | 'joebot','JoeBot', |
1152 | 'jubii','The Jubii Indexing Robot', |
1153 | 'jumpstation','JumpStation', |
1154 | 'kapsi','image.kapsi.net', |
1155 | 'katipo','Katipo', |
1156 | 'kilroy','Kilroy', |
1157 | 'ko[_+ ]yappo[_+ ]robot','KO_Yappo_Robot', |
1158 | 'kummhttp','<a href="http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_g_l_301105_2\b" title="Bot documentation page [new window]" target="_blank">KummHttp</a>', |
1159 | 'labelgrabber\.txt','LabelGrabber', |
1160 | 'larbin','<a href="http://para.inria.fr/~ailleret/larbin/index-eng.html" title="Bot home page [new window]" target="_blank">larbin</a>', |
1161 | 'legs','legs', |
1162 | 'linkidator','Link Validator', |
1163 | 'linkscan','LinkScan', |
1164 | 'lockon','Lockon', |
1165 | 'logo_gif','logo.gif Crawler', |
1166 | 'macworm','Mac WWWWorm', |
1167 | 'lmspider','<a href="http://www.nuance.com/" title="Bot home page lmspider@scansoft.com [new window]" target="_blank">lmspider</a>', |
1168 | 'lwp\-request','<a href="http://search.cpan.org/~gaas/libwww-perl-5.69/bin/lwp-request" title="lwp-request home page [new window]" target="_blank">lwp-request</a>', |
1169 | 'lwp\-trivial','<a href="http://search.cpan.org/src/GAAS/libwww-perl-5.805/lib/LWP/Simple.pm" title="lwp-trivial home page [new window]" target="_blank">lwp-trivial</a>', |
1170 | 'magpie','<a href="http://magpierss.sf.net/" title="Bot home page [new window]" target="_blank">MagpieRSS</a>', |
1171 | 'marvin','marvin/infoseek', |
1172 | 'mattie','Mattie', |
1173 | 'mediafox','MediaFox', |
1174 | 'merzscope','MerzScope', |
1175 | 'meshexplorer','NEC-MeshExplorer', |
1176 | 'mindcrawler','MindCrawler', |
1177 | 'mnogosearch','mnoGoSearch search engine software', |
1178 | 'momspider','MOMspider', |
1179 | 'monster','Monster', |
1180 | 'motor','Motor', |
1181 | 'muncher','Muncher', |
1182 | 'mwdsearch','Mwd.Search', |
1183 | 'ndspider','NDSpider', |
1184 | 'nederland\.zoek','Nederland.zoek', |
1185 | 'netcarta','NetCarta WebMap Engine', |
1186 | 'netmechanic','<a href="http://www.netmechanic.com/" title="Bot home page [new window]" target="_blank">NetMechanic</a>', |
1187 | 'netscoop','NetScoop', |
1188 | 'newscan\-online','newscan-online', |
1189 | 'nhse','NHSE Web Forager', |
1190 | 'northstar','The NorthStar Robot', |
1191 | 'nzexplorer','nzexplorer', |
1192 | 'objectssearch','ObjectsSearch', |
1193 | 'occam','Occam', |
1194 | 'octopus','HKU WWW Octopus', |
1195 | 'openfind','Openfind data gatherer', |
1196 | 'orb_search','Orb Search', |
1197 | 'packrat','Pack Rat', |
1198 | 'pageboy','PageBoy', |
1199 | 'parasite','ParaSite', |
1200 | 'patric','Patric', |
1201 | 'pegasus','pegasus', |
1202 | 'perignator','The Peregrinator', |
1203 | 'perlcrawler','PerlCrawler 1.0', |
1204 | 'phantom','Phantom', |
1205 | 'phpdig','PhpDig', |
1206 | 'piltdownman','PiltdownMan', |
1207 | 'pimptrain','Pimptrain.com\'s robot', |
1208 | 'pioneer','Pioneer', |
1209 | 'pitkow','html_analyzer', |
1210 | 'pjspider','Portal Juice Spider', |
1211 | 'plumtreewebaccessor','PlumtreeWebAccessor', |
1212 | 'poppi','Poppi', |
1213 | 'portalb','PortalB Spider', |
1214 | 'psbot','<a href="http://www.picsearch.com/bot.html" title="Bot home page" target="_blank">psbot</a>', |
1215 | 'python','<a href="http://docs.python.org/library/urllib.html" title="Tools developed using a Python library" target="_blank">Python-urllib</a>', |
1216 | 'raven','Raven Search', |
1217 | 'rbse','RBSE Spider', |
1218 | 'resumerobot','Resume Robot', |
1219 | 'rhcs','RoadHouse Crawling System', |
1220 | 'road_runner','Road Runner: The ImageScape Robot', |
1221 | 'robbie','Robbie the Robot', |
1222 | 'robi','ComputingSite Robi/1.0', |
1223 | 'robocrawl','RoboCrawl Spider', |
1224 | 'robofox','RoboFox', |
1225 | 'robozilla','Robozilla', |
1226 | 'roverbot','Roverbot', |
1227 | 'rules','RuLeS', |
1228 | 'safetynetrobot','SafetyNet Robot', |
1229 | 'search\-info','Sleek', |
1230 | 'search_au','Search.Aus-AU.COM', |
1231 | 'searchprocess','SearchProcess', |
1232 | 'senrigan','Senrigan', |
1233 | 'sgscout','SG-Scout', |
1234 | 'shaggy','ShagSeeker', |
1235 | 'shaihulud','Shai\'Hulud', |
1236 | 'sift','Sift', |
1237 | 'simbot','Simmany Robot Ver1.0', |
1238 | 'site\-valet','Site Valet', |
1239 | 'sitetech','SiteTech-Rover', |
1240 | 'skymob','Skymob.com', |
1241 | 'slcrawler','SLCrawler', |
1242 | 'smartspider','Smart Spider', |
1243 | 'snooper','Snooper', |
1244 | 'solbot','Solbot', |
1245 | 'speedy','<a href="http://www.entireweb.com/about/search_tech/speedyspider/" title="Speedy Spider home page [new window]" target="_blank">Speedy Spider</a>', |
1246 | 'spider[_+ ]monkey','Spider monkey', |
1247 | 'spiderbot','SpiderBot', |
1248 | 'spiderline','Spiderline Crawler', |
1249 | 'spiderman','<a href="http://www.iscrawling.com" title="Spiderman home page [new window]" target="_blank">Spiderman</a>', |
1250 | 'spiderview','SpiderView(tm)', |
1251 | 'spry','Spry Wizard Robot', |
1252 | 'ssearcher','Site Searcher', |
1253 | 'sqworm','<a href="http://www.websense.com/" title="Bot home page (source: http://www.pgts.com.au/) [new window]" target="_blank">Sqworm</a>', |
1254 | 'suke','Suke', |
1255 | 'sunrise','<a href="http://www.sunrisexp.com/" title="Sunrise home page [new window]" target="_blank">Sunrise</a>', |
1256 | 'suntek','suntek search engine', |
1257 | 'sven','Sven', |
1258 | 'tach_bw','TACH Black Widow', |
1259 | 'tagyu_agent','<a href="http://www.tagyu.com/" title="Bot home page [new window]" target="_blank">Tagyu Agent</a>', |
1260 | 'tarantula','Tarantula', |
1261 | 'tarspider','tarspider', |
1262 | 'tailrank','<a href="http://tailrank.com/robot">TailRank</a>', |
1263 | 'techbot','TechBOT', |
1264 | 'templeton','Templeton', |
1265 | 'titan','TITAN', |
1266 | 'titin','TitIn', |
1267 | 'tkwww','The TkWWW Robot', |
1268 | 'tlspider','TLSpider', |
1269 | 'ucsd','UCSD Crawl', |
1270 | 'udmsearch','UdmSearch', |
1271 | 'universalfeedparser','<a href="http://feedparser.org/" title="Bot home page [new window]" target="_blank">UniversalFeedParser</a>', |
1272 | 'urlck','URL Check', |
1273 | 'valkyrie','Valkyrie', |
1274 | 'verticrawl','Verticrawl', |
1275 | 'victoria','Victoria', |
1276 | 'visionsearch','vision-search', |
1277 | 'voidbot','void-bot', |
1278 | 'vwbot','VWbot', |
1279 | 'w3index','The NWI Robot', |
1280 | 'w3m2','W3M2', |
1281 | 'wallpaper','WallPaper (alias crawlpaper)', |
1282 | 'wanderer','the World Wide Web Wanderer', |
1283 | 'wapspider','w@pSpider by wap4.com', |
1284 | 'webbandit','WebBandit Web Spider', |
1285 | 'webcatcher','WebCatcher', |
1286 | 'webcopy','WebCopy', |
1287 | 'webfetcher','webfetcher', |
1288 | 'webfoot','The Webfoot Robot', |
1289 | 'webinator','Webinator', |
1290 | 'weblinker','WebLinker', |
1291 | 'webmirror','WebMirror', |
1292 | 'webmoose','The Web Moose', |
1293 | 'webquest','WebQuest', |
1294 | 'webreader','Digimarc MarcSpider', |
1295 | 'webreaper','WebReaper', |
1296 | 'websnarf','Websnarf', |
1297 | 'webspider','WebSpider', |
1298 | 'webvac','WebVac', |
1299 | 'webwalk','webwalk', |
1300 | 'webwalker','WebWalker', |
1301 | 'webwatch','WebWatch', |
1302 | 'whatuseek','whatUseek Winona', |
1303 | 'whowhere','WhoWhere Robot', |
1304 | 'wired\-digital','Wired Digital', |
1305 | 'wmir','w3mir', |
1306 | 'wolp','WebStolperer', |
1307 | 'wombat','The Web Wombat', |
1308 | 'wordpress','<a href="http://wordpress.org/" title="WordPress home page [new window]" target="_blank">WordPress</a>', |
1309 | 'worm','The World Wide Web Worm', |
1310 | 'woozweb','Woozweb Monitoring', |
1311 | 'wwwc','WWWC Ver 0.2.5', |
1312 | 'wz101','WebZinger', |
1313 | 'xget','XGET', |
1314 | # Other robots reported by users |
1315 | '1\-more_scanner','<a href="http://www.myzips.com/software/1-More-Scanner.phtml" title="1-More Scanner home page [new window]" target="_blank">1-More Scanner</a>', |
1316 | 'accoona\-ai\-agent','<a href="http://www.accoona.com/" title="Accoona-AI-Agent home page [new window]" target="_blank">Accoona-AI-Agent</a>', |
1317 | 'activebookmark','<a href="http://www.libmaster.com/active_bookmark.php" title="ActiveBookmark home page [new window]" target="_blank">ActiveBookmark</a>', |
1318 | 'adamm_bot','<a href="http://home.blic.net/adamm/" title="Bot home page [new window]" target="_blank">AdamM Bot</a>', |
1319 | 'almaden','<a href="http://www.almaden.ibm.com/cs/crawler" title="IBM Almaden Research Center WebFountain™ Bot home page [new window]" target="_blank">IBM Almaden</a> Research Center WebFountain™', |
1320 | 'aipbot','<a href="http://www.aipbot.com/" title="aipbot@aipbot.com Bot home page [new window]" target="_blank">aipbot</a>', |
1321 | 'aleadsoftbot','<a href="http://www.aleadsoft.com/bot.htm" title="ALeadSoftbot home page [new window]" target="_blank">ALeadSoftbot</a>', |
1322 | 'alpha_search_agent','Alpha Search Agent', |
1323 | 'allrati','Allrati', |
1324 | 'aport', 'Aport', |
1325 | 'archive\.org_bot','<a href="http://crawls.archive.org/collections/bncf/crawl.html" title="Bot home page [new window]" target="_blank">archive.org bot</a>', |
1326 | 'argus','<a href="http://www.simpy.com/bot.html" title="feedback@simpy.com Bot home page [new window]" target="_blank">Argus</a>', |
1327 | 'arianna\.libero\.it','<a href="http://arianna.libero.it/" title="Bot home page [new window]" target="_blank">arianna.libero.it</a>', |
1328 | 'aspseek','<a href="http://www.aspseek.org/" title="Bot home page [new window]" target="_blank">ASPseek</a>', |
1329 | 'asterias', 'Asterias', |
1330 | 'awbot', 'AWBot', |
1331 | 'baiduspider','<a href="http://www.baidu.com/search/spider.html" title="Bot home page [new window]" target="_blank">BaiDuSpider</a>', |
1332 | 'becomebot', '<a href="http://www.become.com/site_owners.html" title="Bot home page [new window]" target="_blank">BecomeBot</a>', |
1333 | 'bender','<a href="http://bender.ucr.edu/" title="Bot home page [new window]" target="_blank">bender</a> <a href="http://ivia.ucr.edu/manuals/NiFC/current/index.shtml" title="Bot home page [new window]" target="_blank">focused_crawler</a>', |
1334 | 'betabot','BetaBot', |
1335 | 'biglotron','<a href="http://www.biglotron.com/robot.html" title="Bot home page [new window]" target="_blank">Biglotron</a>', |
1336 | 'bittorrent_bot','<a href="http://www.bittorrent.com/" title="Bot home page [new window]" target="_blank">BitTorrent Bot</a>', |
1337 | 'biz360[_+ ]spider','<a href="http://www.biz360.com/" title="blogsmanager@biz360.com Bot home page [new window]" target="_blank">Biz360 spider</a>', |
1338 | 'blogbridge[_+ ]service','<a href="http://www.blogbridge.com/" title="Bot home page [new window]" target="_blank">BlogBridge Service</a>', |
1339 | 'bloglines','<a href="http://www.bloglines.com/" title="Bot home page [new window]" target="_blank">Bloglines</a>', |
1340 | 'blogpulse','<a href="http://www.intelliseek.com/" title="Bot home page [new window]" target="_blank">BlogPulse ISSpider intelliseek.com</a>', |
1341 | 'blogsearch','<a href="http://www.icerocket.com/" title="Bot home page [new window]" target="_blank">BlogSearch</a>', |
1342 | 'blogshares','<a href="http://blogshares.com/help.php?node=7" title="Bot home page [new window]" target="_blank">Blogshares Spiders</a>', |
1343 | 'blogslive','<a href="http://www.blogslive.com/" title="info@blogslive.com Bot home page [new window]" target="_blank">Blogslive</a>', |
1344 | 'blogssay','<a href="http://www.blogssay.com/" title="Bot home page [new window]" target="_blank">BlogsSay :: RSS Search Crawler</a>', |
1345 | 'bncf\.firenze\.sbn\.it\/raccolta\.txt','<a href="http://www.bncf.firenze.sbn.it/raccolta.txt" title="Bot home page [new window]" target="_blank">Biblioteca Nazionale Centrale di Firenze</a>', |
1346 | 'bobby', 'Bobby', |
1347 | 'boitho\.com\-dc','<a href="http://www.boitho.com/dcbot.html" title="Bot home page [new window]" target="_blank">boitho.com-dc</a>', |
1348 | 'bookmark\-manager','<a href="http://bkm.sourceforge.net/" title="Bookmark-Manager home page [new window]" target="_blank">Bookmark-Manager</a>', |
1349 | 'boris', 'Boris', |
1350 | 'bumblebee', 'Bumblebee (relevare.com)', |
1351 | 'candlelight[_+ ]favorites[_+ ]inspector','<a href="http://www.candlelight.com/home.html" title="Candlelight_Favorites_Inspector home page [new window]" target="_blank">Candlelight_Favorites_Inspector</a>', |
1352 | 'cbn00glebot','cbn00glebot', |
1353 | 'cerberian_drtrs','<a href="http://www.pgts.com.au/cgi-bin/psql?robot_info=25240" title="Bot home page [new window]" target="_blank">Cerberian Drtrs</a>', |
1354 | 'cfnetwork','<a href="http://www.cocoadev.com/index.pl?CFNetwork" title="CFNetwork home page [new window]" target="_blank">CFNetwork</a>', |
1355 | 'cipinetbot','<a href="http://www.cipinet.com/bot.html" title="CipinetBot home page [new window]" target="_blank">CipinetBot</a>', |
1356 | 'checkweb_link_validator','<a href="http://p.duby.free.fr/chkweb.htm" title="CheckWeb link validator home page [new window]" target="_blank">CheckWeb link validator</a>', |
1357 | 'commons\-httpclient','<a href="http://jakarta.apache.org/commons/httpclient/" title="Bot home page [new window]" target="_blank">Jakarta commons-httpclient</a>', |
1358 | 'computer_and_automation_research_institute_crawler','<a href="http://www.ilab.sztaki.hu/~stamas/publications/p184-benczur.html" title="Computer and Automation Research Institute Crawler home page [new window]" target="_blank">Computer and Automation Research Institute Crawler</a>', |
1359 | 'converamultimediacrawler','<a href="http://www.authoritativeweb.com/crawl/" title="ConveraMultiMediaCrawler home page [new window]" target="_blank">ConveraMultiMediaCrawler</a>', |
1360 | 'converacrawler','<a href="http://www.authoritativeweb.com/crawl/" title="ConveraCrawler home page [new window]" target="_blank">ConveraCrawler</a>', |
1361 | 'cscrawler','CsCrawler', |
1362 | 'cse_html_validator_lite_online','<a href="http://online.htmlvalidator.com/php/onlinevallite.php" title="CSE HTML Validator Lite Online home page [new window]" target="_blank">CSE HTML Validator Lite Online</a>','cuasarbot','<a href="http://www.cuasar.com/" title="Cuasarbot home page [new window]" target="_blank">Cuasarbot</a>', |
1363 | 'cursor','<a href="http://adcenter.hu/docs/en/bot.html " title="Cursor home page [new window]" target="_blank">Cursor</a>', |
1364 | 'custo','<a href="http://www.netwu.com/custo/" title="Custo home page [new window]" target="_blank">Custo</a>', |
1365 | 'datafountains\/dmoz_downloader','<a href="http://infomine.ucr.edu/ " title="DataFountains/DMOZ Downloader home page [new window]" target="_blank">DataFountains/DMOZ Downloader</a>', |
1366 | 'daviesbot', 'DaviesBot', |
1367 | 'daypopbot', 'DayPop', |
1368 | 'deepindex','<a href="http://www.deepindex.net/faq.php" title="Deepindex home page [new window]" target="_blank">Deepindex</a>', |
1369 | 'dipsie\.bot','<a href="http://www.dipsie.com/bot/" title="Bot home page [new window]" target="_blank">Dipsie</a>', |
1370 | 'dnsgroup','<a href="http://www.dnsgroup.com/" title="DNSGroup home page [new window]" target="_blank">DNSGroup</a>', |
1371 | 'domainchecker','<a href="http://net-promoter.com/" title="DomainChecker home page (not confirmed) [new window]" target="_blank">DomainChecker</a>', |
1372 | 'domainsdb\.net','<a href="http://domainsdb.net/" title="Bot home page [new window]" target="_blank">DomainsDB.net</a>', |
1373 | 'dulance','<a href="http://www.dulance.com/bot.jsp" title="Bot home page [new window]" target="_blank">Dulance</a>', |
1374 | 'dumbot','<a href="http://www.dumbfind.com/" title="Dumbot home page [new window]" target="_blank">Dumbot</a>', |
1375 | 'dumm\.de\-bot','<a href="http://www.dumm.de/" title="dumm.de-Bot home page [new window]" target="_blank">dumm.de-Bot</a>', |
1376 | 'earthcom\.info','<a href="http://www.earthcom.info/" title="Bot home page [new window]" target="_blank">EARTHCOM.info</a>', |
1377 | 'easydl','<a href="http://keywen.com/Encyclopedia/Bot/" title="EasyDL home page [new window]" target="_blank">EasyDL</a>', |
1378 | 'edgeio\-retriever','<a href="http://www.edgeio.com/" title="Bot home page [new window]" target="_blank">edgeio-retriever</a>', |
1379 | 'ets_v','<a href="http://www.freetranslation.com/help/" title="ETS home page [new window]" target="_blank">ETS</a> Enterprise Translation Server', |
1380 | 'exactseek','ExactSeek Crawler', |
1381 | 'extreme[_+ ]picture[_+ ]finder','<a href="http://www.exisoftware.com/" title="Extreme_Picture_Finder home page [new window]" target="_blank">Extreme_Picture_Finder</a>', |
1382 | 'eventax','<a href="http://www.eventax.de/" title="eventax home page [new window]" target="_blank">eventax</a>', |
1383 | 'everbeecrawler','EverbeeCrawler', |
1384 | 'everest\-vulcan','<a href="http://everest.vulcan.com/crawlerhelp" title="Bot home page [new window]" target="_blank">Everest-Vulcan</a>', |
1385 | 'ezresult', 'Ezresult', |
1386 | 'enteprise','<a href="http://www.fastsearch.com/" title="Bot home page [new window]" target="_blank">Fast Enteprise Crawler</a>', |
1387 | 'facebook','FaceBook bot', |
1388 | 'fast\-search\-engine','<a href="http://www.fast-search-engine.com/" title="Bot home page [new window]" target="_blank">Fast-Search-Engine</a> (not fastsearch.com)', |
1389 | 'fast_enterprise_crawler','<a href="http://www.fast.no/" title="FAST Enterprise Crawler home page [new window]" target="_blank">FAST Enterprise Crawler</a>', |
1390 | 'fast_enterprise_crawler.*scrawleradmin\.t\-info@telekom\.de','<a href="http://www.telekom.de/" title="FAST Enterprise Crawler * crawleradmin.t-info@telekom.de home page [new window]" target="_blank">FAST Enterprise Crawler * crawleradmin.t-info@telekom.de</a>', |
1391 | 'matrix_s\.p\.a\._\-_fast_enterprise_crawler','<a href="http://tin.virgilio.it/" title="Matrix S.p.A. - FAST Enterprise Crawler home page [new window]" target="_blank">Matrix S.p.A. - FAST Enterprise Crawler</a>', |
1392 | 'fast_enterprise_crawler.*t\-info_bi_cluster_crawleradmin\.t\-info@telekom\.de','<a href="http://www.telekom.de/" title="FAST Enterprise Crawler * T-Info_BI_cluster crawleradmin.t-info@telekom.de home page [new window]" target="_blank">FAST Enterprise Crawler * T-Info_BI_cluster crawleradmin.t-info@telekom.de</a>', |
1393 | 'favicon','FavIconizer', |
1394 | 'favorg','<a href="http://www.pcmag.com/article2/0,4149,108438,00.asp" title="FavOrg home page [new window]" target="_blank">FavOrg</a>', |
1395 | 'favorites_sweeper','<a href="http://www.manitools.com/favsweep/" title="Favorites_Sweeper home page [new window]" target="_blank">Favorites Sweeper</a>', |
1396 | 'feedburner', 'Feedburner', |
1397 | 'feedfetcher\-google','<a href="http://www.google.com/feedfetcher.html" title="Bot home page [new window]" target="_blank">Feedfetcher-Google</a>', |
1398 | 'feedflow','<a href="http://feedflow.com/about" title="Bot home page [new window]" target="_blank">FeedFlow</a>', |
1399 | 'feedster','<a href="http://www.feedster.com/" title="Bot home page [new window]" target="_blank">Feedster</a>', |
1400 | 'feedsky','<a href="http://www.feedsky.com/" title="Bot home page [new window]" target="_blank">FeedSky</a>', |
1401 | 'feedvalidator','<a href="http://feedvalidator.org/" title="FeedValidator home page [new window]" target="_blank">FeedValidator</a>', |
1402 | 'filmkamerabot','<a href="http://www.filmkamera.at/bot.html" title="FilmkameraBot home page [new window]" target="_blank">FilmkameraBot</a>', |
1403 | 'findexa_crawler','<a href="http://www.findexa.no/gulesider/article26548.ece " title="Findexa Crawler home page [new window]" target="_blank">Findexa Crawler</a>', |
1404 | 'geniebot','<a href="http://www.genieknows.com/" title="Bot home page [new window]" target="_blank">Geniebot</a>', |
1405 | 'findlinks','<a href="http://wortschatz.uni-leipzig.de/findlinks/" title="Bot home page [new window]" target="_blank">Findlinks</a>', |
1406 | 'fooky\.com\/ScorpionBot','<a href="http://www.fooky.com/scorpionbots" title="Fooky.com/ScorpionBot/ScoutOut home page [new window]" target="_blank">Fooky.com/ScorpionBot/ScoutOut</a>', |
1407 | 'g2crawler','<a href="http://crawler.instantnetworks.net/" title="Bot home page (nobody@airmail.net) [new window]" target="_blank">G2Crawler</a>', |
1408 | 'gaisbot','<a href="http://gais.cs.ccu.edu.tw/robot.php" title="Bot home page [new window]" target="_blank">Gaisbot</a>', |
1409 | 'gigabot','<a href="http://www.gigablast.com/spider.html" title="Bot home page [new window]" target="_blank">GigaBot</a>', |
1410 | 'girafabot','<a href="http://www.girafa.com/" title="Bot home page [new window]" target="_blank">Girafabot</a>', |
1411 | 'global_fetch','<a href="http://www.wesonet.com/" title="Global Fetch home page [new window]" target="_blank">Global Fetch</a>', |
1412 | 'gnodspider','GNOD Spider', |
1413 | 'goforit\.com','<a href="http://www.goforit.com/about/" title="GoForIt.com home page [new window]" target="_blank">GoForIt.com</a>', |
1414 | 'goforitbot','<a href="http://www.goforit.com/about/" title="GOFORITBOT home page [new window]" target="_blank">GOFORITBOT</a>', |
1415 | 'gonzo','<a href="http://www.suchen.de/faq.html" title="Bot home page [new windows]" target="_blank">suchen.de</a>', |
1416 | 'gpu_p2p_crawler','<a href="http://gpu.sourceforge.net/search_engine.php" title="Bot home page [new window]" target="_blank">GPU p2p crawler</a>', |
1417 | 'grub','Grub.org', |
1418 | 'henrythemiragorobot', '<a href="http://www.miragorobot.com/scripts/mrinfo.asp" title="Bot home page [new window]" target="_blank">Mirago</a>', |
1419 | 'heritrix','<a href="http://crawler.archive.org/" title="(used by a few different companies) Bot home page [new window]" target="_blank">Heritrix</a>', |
1420 | 'holmes', 'Holmes', |
1421 | 'hoowwwer','<a href="http://cosco.hiit.fi/search/hoowwwer/" title="HooWWWer home page [new window]" target="_blank">HooWWWer</a>', |
1422 | 'hpprint','HPPrint', |
1423 | 'htmlparser','<a href="http://htmlparser.sourceforge.net/" title="HTMLParser home page [new window]" target="_blank">HTMLParser</a>', |
1424 | 'html[_+ ]link[_+ ]validator','<a href="http://www.lithopssoft.com/ " title="Html_Link_Validator home page [new window]" target="_blank">Html_Link_Validator</a>', |
1425 | 'httrack','<a href="http://www.httrack.com/" title="Bot home page [new window]" target="_blank">HTTrack off-line browser</a>', |
1426 | 'hundesuche\.com\-bot','<a href="http://www.hundesuche.com/" title="Hundesuche.com-Bot home page [new window]" target="_blank">Hundesuche.com-Bot</a>', |
1427 | 'ichiro','<a href="http://help.goo.ne.jp/door/crawlerE.html" title="Bot home page [new window]" target="_blank">ichiro</a>', |
1428 | 'iltrovatore\-setaccio','<a href="http://www.iltrovatore.it/aiuto/motore_di_ricerca.html" title="bot@iltrovatore.it IlTrovatore-Setaccio home page [new window]" target="_blank">IlTrovatore-Setaccio</a>', |
1429 | 'infobot','<a href="http://www.infobot.org/" title="InfoBot home page [new window]" target="_blank">InfoBot</a>', |
1430 | 'infociousbot','<a href="http://corp.infocious.com/tech_crawler.php" title="InfociousBot home page [new window]" target="_blank">InfociousBot</a>', |
1431 | 'infomine','<a href="http://infomine.ucr.edu/useragents" title="Bot home page [new window]" target="_blank">INFOMINE VLCrawler</a>', |
1432 | 'insurancobot','<a href="http://www.fastspywareremoval.com/" title="InsurancoBot home page [new window]" target="_blank">InsurancoBot</a>', |
1433 | 'internet[_+ ]ninja','<a href="http://www.dti.ne.jp/ " title="Internet_Ninja home page [new window]" target="_blank">Internet_Ninja </a>', |
1434 | 'internetarchive','<a href="http://lucene.apache.org/nutch/bot.html " title="InternetArchive home page [new window]" target="_blank">InternetArchive</a>', |
1435 | 'internetseer', 'InternetSeer', |
1436 | 'internetsupervision','<a href="http://internetsupervision.com/" title="InternetSupervision home page [new window]" target="_blank">InternetSupervision</a>', |
1437 | 'irlbot','<a href="http://irl.cs.tamu.edu/crawler" title="Bot home page [new window]" target="_blank">IRLbot</a>', |
1438 | 'isearch2006','<a href="http://www.yahoo.com.cn/" title="isearch2006 home page [new window]" target="_blank">isearch2006</a>', |
1439 | 'iupui_research_bot','<a href="http://spamhuntress.com/2005/04/25/a-mail-harvester-visits/" title="IUPUI_Research_Bot home page [new window]" target="_blank">IUPUI_Research_Bot</a>', |
1440 | 'jrtwine[_+ ]software[_+ ]check[_+ ]favorites[_+ ]utility','<a href="http://www.jrtwine.com/Products/CheckFavs/" title="JRTwine_Software_Check_Favorites_Utility home page [new window]" target="_blank">JRTwine_Software_Check_Favorites_Utility</a>', |
1441 | 'justview', 'JustView', |
1442 | 'kalambot','<a href="http://64.124.122.251/feedback.html" title="KalamBot home page [new window]" target="_blank">KalamBot</a>', |
1443 | 'kamano\.de_newsfeedverzeichnis','<a href="http://www.kamano.de/" title="kamano.de NewsFeedVerzeichnis home page [new window]" target="_blank">kamano.de NewsFeedVerzeichnis</a>', |
1444 | 'kazoombot','<a href="http://www.kazoom.ca/bot.html" title="kazoombot@kazoom.ca KazoomBot home page [new window]" target="_blank">KazoomBot</a>', |
1445 | 'kevin','<a href="http://dznet.com/kevin/" title="Kevin home page [new window]" target="_blank">Kevin</a>', |
1446 | 'keyoshid','<a href="http://www.yahoo.co.jp/" title="Bot home page [new window]" target="_blank">Yahoo! Japan keyoshid robot study</a>', |
1447 | 'kinjabot', 'Kinjabot', |
1448 | 'kinja\-imagebot', 'Kinja Imagebot', |
1449 | 'knowitall','<a href="http://www.cs.washington.edu/research/knowitall/" title="KnowItAll home page [new window]" target="_blank">KnowItAll</a>', |
1450 | 'knowledge\.com','<a href="http://www.knowledge.com/" title="Knowledge.com home page [new window]" target="_blank">Knowledge.com</a>', |
1451 | 'kouaa_krawler','<a href="http://www.kouaa.com/" title="Kouaa Krawler home page [new window]" target="_blank">Kouaa Krawler</a>', |
1452 | 'krugle','<a href="http://www.krugle.com/crawler/info.html" title="Bot home page [new window]" target="_blank">Krugle</a>', |
1453 | 'ksibot','<a href="http://ego.ms.mff.cuni.cz/" title="Bot home page [new window]" target="_blank">ksibot</a>', |
1454 | 'kurzor','<a href="http://www.easymail.hu/" title="cursor@easymail.hu Kurzor home page [new window]" target="_blank">Kurzor</a>', |
1455 | 'lanshanbot','<a href="http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=%5Cbid_g_l_140406_1%5Cb" title="Bot Information [new window]" target="_blank">lanshanbot</a>', |
1456 | 'letscrawl\.com','<a href="http://letscrawl.com/" title="Bot home page [new window]" target="_blank">LetsCrawl.com</a>', |
1457 | 'libcrawl','Crawl libcrawl', |
1458 | 'link_valet_online','<a href="http://www.htmlhelp.com/tools/valet/" title="Link Valet Online home page [new window]" target="_blank">Link Valet Online</a>', |
1459 | 'linkbot','LinkBot', |
1460 | 'linkchecker','<a href="http://linkchecker.sourceforge.net" title="Bot home page [new window]" target="_blank">LinkChecker</a>', |
1461 | 'livejournal\.com', 'LiveJournal.com', |
1462 | 'magpierss', 'MagpieRSS', |
1463 | 'mail\.ru', 'Mail.ru bot', |
1464 | 'mapoftheinternet\.com','<a href="http://MapoftheInternet.com/" title="MapoftheInternet.com home page [new window]" target="_blank">MapoftheInternet.com</a>', |
1465 | 'mediapartners\-google','<a href="https://adwords.google.com/" title="Bot home page [new window]" target="_blank">Google AdSense</a>', |
1466 | 'megite','<a href="http://www.megite.com/" title="Megite home page [new window]" target="_blank">Megite</a>', |
1467 | 'metager\-linkchecker','MetaGer LinkChecker', |
1468 | 'metaspinner','<a href="http://index.meta-spinner.de/" title="Metaspinner home page [new window]" target="_blank">Metaspinner</a>', |
1469 | 'microsoft[_+ ]url[_+ ]control','<a href="http://www.webmasterworld.com/forum11/1005.htm" title="Microsoft URL Control home page [new window]" target="_blank">Microsoft URL Control</a>', |
1470 | 'minirank','<a href="http://minirank.com/" title="miniRank home page [new window]" target="_blank">miniRank</a>', |
1471 | 'mini\-reptile','Mini-reptile', |
1472 | 'missigua_locator','<a href="http://www.webmasterworld.com/forum11/2690.htm" title="Missigua_Locator home page [new window]" target="_blank">Missigua_Locator</a>', |
1473 | 'misterbot','<a href="http://www.misterbot.fr/" title="Misterbot home page [new window]" target="_blank">Misterbot</a>', |
1474 | 'miva','<a href="http://www.miva.com/" title="Miva home page [new window]" target="_blank">Miva</a>', |
1475 | 'mizzu_labs','<a href="http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_m_141105_2\b " title="Mizzu Labs home page [new window]" target="_blank">Mizzu Labs</a>', |
1476 | 'mj12bot','<a href="http://majestic12.co.uk/bot.php" title="Bot home page. [new window]" target="_blank">MJ12bot</a>', |
1477 | 'mojeekbot','<a href="http://www.mojeek.com/bot.html" title="Bot home page. [new window]" target="_blank">MojeekBot</a>', |
1478 | 'msiecrawler','<a href="http://msdn.microsoft.com/workshop/delivery/offline/linkrel.asp" title="Bot home page. [new window]" target="_blank">MSIECrawler</a>', |
1479 | 'ms_search_4\.0_robot','<a href="http://support.microsoft.com/default.aspx?scid=kb;en-us;284022" title="Bot home page. [new window]" target="_blank">MS SharePoint Portal Server - MS Search 4.0 Robot</a>', |
1480 | 'msrabot','msrabot', |
1481 | 'msrbot','<a href="http://research.microsoft.com/research/sv/msrbot/" title="MSRBOT home page [new window]" target="_blank">MSRBOT</a>', |
1482 | 'mt::telegraph::agent','MT::Telegraph::Agent', |
1483 | 'mydoyouhike','<a href="http://www.doyouhike.net/my" title="Mydoyouhike home page [new window]" target="_blank">Mydoyouhike</a>', |
1484 | 'nagios','Nagios', |
1485 | 'nasa_search','<a href="http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_n_s_140506_2\b" title="NASA Search home page [new window]" target="_blank">NASA Search</a>', |
1486 | 'netluchs','<a href="http://www.netluchs.de/" title="Bot home page. [new window]" target="_blank">Netluchs</a>', |
1487 | 'netsprint','<a href="http://www.netsprint.pl/serwis/" title="NetSprint home page [new window]" target="_blank">NetSprint</a>', |
1488 | 'newsgatoronline', 'NewsGator Online', |
1489 | 'nicebot','<a href="http://www.egghelp.org/setup.htm" title="Bot home page (there may be others) [new window]" target="_blank">nicebot</a>', |
1490 | 'nimblecrawler','<a href="http://www.healthline.com/" title="NimbleCrawler home page [new window]" target="_blank">NimbleCrawler</a>', |
1491 | 'noxtrumbot','<a href="http://www.noxtrum.com/" title="Bot home page [new window]" target="_blank">noxtrumbot</a>', |
1492 | 'npbot','<a href="http://www.nameprotect.com/botinfo.html" title="NPBot home page [new window]" target="_blank">NPBot</a>', |
1493 | 'nutchcvs','<a href="http://lucene.apache.org/nutch/bot.html" title="NutchCVS home page [new window]" target="_blank">NutchCVS</a>', |
1494 | 'nutchosu\-vlib','<a href="http://lucene.apache.org/nutch/bot.html" title="NutchOSU-VLIB home page [new window]" target="_blank">NutchOSU-VLIB</a>', |
1495 | 'nutch','<a href="http://lucene.apache.org/nutch/" title="Bot home page. Used by many, including Looksmart. [new window]" target="_blank">Nutch</a>', |
1496 | 'ocelli','<a href="http://www.globalspec.com/Ocelli/" title="Ocelli home page [new window]" target="_blank">Ocelli</a>', |
1497 | 'octora_beta_bot','<a href="http://www.octora.com/" title="Bot home page [new window]" target="_blank">Octora Beta Bot</a>', |
1498 | 'omniexplorer[_+ ]bot','<a href="http://www.omni-explorer.com/" title="Bot home page. [new window]" target="_blank">OmniExplorer Bot</a>', |
1499 | 'onet\.pl[_+ ]sa','<a href="http://szukaj.onet.pl/" title="Onet.pl_SA home page [new window]" target="_blank">Onet.pl_SA</a>', |
1500 | 'onfolio','<a href="http://www.onfolio.com/" title="Bot home page [new window]">Onfolio</a>', |
1501 | 'opentaggerbot','<a href="http://www.opentagger.com/opentaggerbot.htm" title="Bot home page [new window]">OpenTaggerBot</a>', |
1502 | 'openwebspider','<a href="http://www.openwebspider.org/" title="OpenWebSpider home page [new window]" target="_blank">OpenWebSpider</a>', |
1503 | 'oracle_ultra_search','<a href="http://www.oracle.com/technology/products/ultrasearch/index.html" title="Oracle Ultra Search home page [new window]" target="_blank">Oracle Ultra Search</a>', |
1504 | 'orbiter','<a href="http://www.dailyorbit.com/bot.htm" title="Orbiter home page [new window]" target="_blank">Orbiter</a>', |
1505 | 'yodaobot','<a href="http://www.yodao.com/help/webmaster/spider/" title="YodaoBot">OutfoxBot/YodaoBot</a>', |
1506 | 'qihoobot','<a href="http://www.qihoo.com/" title="QihooBot">QihooBot</a>', |
1507 | 'passwordmaker\.org','<a href="http://passwordmaker.org/" title="passwordmaker.org home page [new window]" target="_blank">passwordmaker.org</a>', |
1508 | 'pear_http_request_class','<a href="http://pear.php.net/" title="PEAR HTTP Request class home page [new window]" target="_blank">PEAR HTTP Request class</a>', |
1509 | 'peerbot','<a href="http://www.peerbot.com/" title="PEERbot home page [new window]" target="_blank">PEERbot</a>', |
1510 | 'perman', 'Perman surfer', |
1511 | 'php[_+ ]version[_+ ]tracker','<a href="http://www.nexen.net/phpversion/bot.php" title="PHP Version Tracker home page [new window]" target="_blank">PHP version tracker</a>', |
1512 | 'pictureofinternet','<a href="http://malfunction.org/poi/" title="PictureOfInternet home page [new window]" target="_blank">PictureOfInternet</a>', |
1513 | 'ping\.blo\.gs','<a href="http://blo.gs/ping.php" title="Bot home page. [new window]" target="_blank">ping.blo.gs</a>', |
1514 | 'plinki','<a href="http://www.plinki.com/" title="plinki home page [new window]" target="_blank">plinki</a>', |
1515 | 'pluckfeedcrawler','<a href="http://www.pluck.com/" title="Bot home page. [new window]" target="_blank">PluckFeedCrawler</a>', |
1516 | 'pogodak','<a href="http://www.pogodak.com" title="Pogodak home page [new window]" target="_blank">Pogodak.com</a>', |
1517 | 'pompos','<a href="http://dir.com/pompos.html" title="Bot home page. [new window]" target="_blank">Pompos</a>', |
1518 | 'popdexter','Popdexter', |
1519 | 'port_huron_labs','<a href="http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_n_s_1133\b" title="Port Huron Labs home page [new window]" target="_blank">Port Huron Labs</a>', |
1520 | 'postfavorites','<a href="http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_n_s_1135\b " title="PostFavorites home page [new window]" target="_blank">PostFavorites</a>', |
1521 | 'projectwf\-java\-test\-crawler','ProjectWF-java-test-crawler', |
1522 | 'proodlebot','<a href="http://www.proodle.com/" title="proodleBot home page [new window]" target="_blank">proodleBot</a>', |
1523 | 'pyquery','<a href="http://sourceforge.net/projects/pyquery/" title="PyQuery home page [new window]" target="_blank">PyQuery</a>', |
1524 | 'rambler','<a href="http://www.rambler.ru/doc/faq.shtml" title="Bot home page [new window]">StackRambler</a>', |
1525 | 'redalert','Red Alert', |
1526 | 'relevantnoise\.com', '<a href="http://www.relevantnoise.com/" title="Relevant Noise [new window]" target="_blank">Relevant Noise</a>', |
1527 | 'rojo','<a href="http://rojo.com/" title="Bot home page [new window]" target="_blank">RoJo</a> aggregator', |
1528 | 'rssimagesbot','<a href="http://herbert.groot.jebbink.nl/?app=rssImages" title="Bot home page [new window]" target="_blank">rssImagesBot</a>', |
1529 | 'ruffle','<a href="http://www.unreach.net/" title="Bot home page [new window]" target="_blank">ruffle SemanticWeb crawler</a>', |
1530 | 'rufusbot','<a href="http://64.124.122.252.webaroo.com/feedback.html" title="Bot home page [new window]" target="_blank">RufusBot Rufus Web Miner</a>', |
1531 | 'sandcrawler','<a href="http://www.microsoft.com/" title="Bot home page [new window]" target="_blank">SandCrawler (Microsoft)</a>', |
1532 | 'sbider','<a href="http://www.sitesell.com/sbider.html" title="Bot home page [new window]" target="_blank">SBIder</a>', |
1533 | 'schizozilla','<a href="http://spamhuntress.com/2005/03/18/gizmo/ " title="Schizozilla home page [new window]" target="_blank">Schizozilla</a>', |
1534 | 'scumbot','Scumbot', |
1535 | 'searchguild[_+ ]dmoz[_+ ]experiment','<a href="http://www.searchguild.com/" title="SearchGuild_DMOZ_Experiment home page [new window]" target="_blank">SearchGuild_DMOZ_Experiment</a>', |
1536 | 'seekbot','<a href="http://www.seekbot.net/bot.html" title="Bot home page [new window]">Seekbot</a>', |
1537 | 'sensis_web_crawler','<a href="http://www.sensis.com.au/" title="Sensis Web Crawler home page [new window]" target="_blank">Sensis Web Crawler</a>', |
1538 | 'seznambot','<a href="http://fulltext.seznam.cz/" title="Bot home page [new window]" target="_blank">SeznamBot</a>', |
1539 | 'shim\-crawler','<a href="http://www.logos.ic.i.u-tokyo.ac.jp/crawler/" title="crawl@logos.ic.i.u-tokyo.ac.jp Bot home page [new window]" target="_blank">Shim-Crawler</a>', |
1540 | 'shoutcast','Shoutcast Directory Service', |
1541 | 'slysearch','SlySearch', |
1542 | 'snap\.com_beta_crawler','<a href="http://www.snap.com/" title="snap.com beta crawler home page [new window]" target="_blank">snap.com beta crawler</a>', |
1543 | 'sohu\-search','<a href="http://corp.sohu.com/" title="Bot home page [new window]" target="_blank">sohu-search</a>', |
1544 | 'sohu','<a href="http://corp.sohu.com/" title="Bot home page [new window]" target="_blank">sohu agent</a>', |
1545 | 'snappy','<a href="http://www.urltrends.com/faq.php" title="Bot home page [new window]" target="_blank">Snappy</a>', |
1546 | 'sphere_scout','<a href="http://www.sphere.com/" title="Bot home page [new window]" target="_blank">Sphere Scout</a>', |
1547 | 'spip','<a href="http://www.spip.net" title="SPIP home page [new window]" target="_blank">SPIP</a>', |
1548 | 'sproose_crawler','<a href="http://www.sproose.com/bot.html" title="Bot home page [new window]" target="_blank">sproose crawler</a>', |
1549 | 'steroid__download','<a href="http://faqs.org.ru/progr/pascal/delphi_internet2.htm" title="STEROID Download home page [new window]" target="_blank">STEROID Download</a>', |
1550 | 'steeler','<a href="http://www.tkl.iis.u-tokyo.ac.jp/~crawler/ " title="Steeler home page [new window]" target="_blank">Steeler</a>', |
1551 | 'suchfin\-bot','<a href="http://www.suchfin.de/" title="Suchfin-Bot home page [new window]" target="_blank">Suchfin-Bot</a>', |
1552 | 'superbot','<a href="http://www.sparkleware.com/superbot/" title="SuperBot home page [new window]" target="_blank">SuperBot</a>', |
1553 | 'surveybot','SurveyBot', |
1554 | 'susie','<a href="http://www.sync2it.com/bms/susie.php" title="Susie home page [new window]" target="_blank">Susie</a>', |
1555 | 'syndic8','Syndic8', |
1556 | 'syndicapi','<a href="http://syndicapi.com/bot.html" title="Bot home page [new window]" target="_blank">SyndicAPI</a>', |
1557 | 'synoobot','<a href="http://www.synoo.de/bot.html" title="webmaster@synoo.com SynooBot home page [new window]" target="_blank">SynooBot</a>', |
1558 | 'tcl_http_client_package','<a href="http://www.tcl.tk/man/tcl8.4/TclCmd/http.htm" title="Tcl http client package home page [new window]" target="_blank">Tcl http client package</a>', |
1559 | 'technoratibot', 'Technoratibot', |
1560 | 'teragramcrawlersurf','<a href="http://www.teragram.com/" title="TeragramCrawlerSURF home page [new window]" target="_blank">TeragramCrawlerSURF</a>', |
1561 | 'test_crawler','<a href="http://netp.ath.cx/" title="Test Crawler home page [new window]" target="_blank">Test Crawler</a>', |
1562 | 'testbot','<a href="http://www.agbrain.com/" title="TestBot home page [new window]" target="_blank">TestBot</a>', |
1563 | 't\-h\-u\-n\-d\-e\-r\-s\-t\-o\-n\-e','<a href="http://www.thunderstone.com/" title="Bot home page. Used by many. [new window]" target="_blank">T-H-U-N-D-E-R-S-T-O-N-E</a>', |
1564 | 'topicblogs', '<a href="http://www.topicblogs.com/" title="Bot home page [new window]" target="_blank">topicblogs</a>', |
1565 | 'turnitinbot','Turn It In', |
1566 | 'turtle', 'Turtle', |
1567 | 'turtlescanner', 'Turtle', |
1568 | 'tutorgigbot','<a href="http://www.tutorgig.info/" title="TutorGigBot home page [new window]" target="_blank">TutorGigBot</a>', |
1569 | 'twiceler','<a href="http://www.cuill.com/twiceler/robot.html" title="Twiceler home page [new window]" target="_blank">twiceler</a>', |
1570 | 'ubicrawler','<a href="http://law.dsi.unimi.it/ubicrawler/" title="Bot home page [new window]" target="_blank">UbiCrawler</a>', |
1571 | 'ultraseek', 'Ultraseek', |
1572 | 'unchaos_bot_hybrid_web_search_engine','<a href="http://www.unchaos.com/" title="UnChaos Bot Hybrid Web Search Engine home page [new window]" target="_blank">UnChaos Bot Hybrid Web Search Engine</a>', |
1573 | 'unido\-bot','<a href="http://www.unchina.org/unido/unido/our_projects/3_3.html" title="unido-bot home page [new window]" target="_blank">unido-bot</a>', |
1574 | 'updated','<a href="http://www.updated.com/" title="updated home page [new window]" target="_blank">updated</a>', |
1575 | 'ustc\-semantic\-group','<a href="http://ai.ustc.edu.cn/mas/en/research/index.php" title="Bot home page [new window]" target="_blank">USTC-Semantic-Group</a>', |
1576 | 'vagabondo\-wap','<a href="http://www.wise-guys.nl/Contact/index.php?botselected=webagents&lang=uk" title="Bot home page [new window]" target="_blank">Vagabondo-WAP</a>', |
1577 | 'vagabondo','<a href="http://www.wise-guys.nl/Contact/index.php?botselected=webagents&lang=uk" title="Bot home page [new window]" target="_blank">Vagabondo</a>', |
1578 | 'vermut','<a href="http://vermut.aol.com/" title="Bot home page [new window]" target="_blank">Vermut</a>', |
1579 | 'versus_crawler_from_eda\.baykan@epfl\.ch','<a href="http://www.epfl.ch/Eindex.html " title="versus crawler from eda.baykan@epfl.ch home page [new window]" target="_blank">versus crawler from eda.baykan@epfl.ch</a>', |
1580 | 'vespa_crawler','<a href="http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=%5Cbid_t_z_030406_1%5Cb" title="Bot home page [new window]" target="_blank">Vespa Crawler</a>', |
1581 | 'vortex','<a href="http://marty.anstey.ca/projects/robots/vortex/" title="Bot home page [new window]" target="_blank">VORTEX</a>', |
1582 | 'vse\/','<a href="http://www.vivisimo.com/" title="VSE home page [new window]" target="_blank">VSE</a>', |
1583 | 'w3c\-checklink','<a href="http://validator.w3.org/checklink/" title="Bot home page [new window]" target="_blank">W3C Link Checker</a>', |
1584 | 'w3c[_+ ]css[_+ ]validator[_+ ]jfouffa', '<a href="http://jigsaw.w3.org/css-validator/" title="Bot home page [new window]" target="_blank">W3C jigsaw CSS Validator</a>', |
1585 | 'w3c_validator','<a href="http://validator.w3.org/" title="Bot home page [new window]" target="_blank">W3C Validator</a>', |
1586 | 'watchmouse', '<a href="http://www.watchmouse.com/en/" title="WatcMouse">WatchMouse Website Monitor</a>', |
1587 | 'wavefire','<a href="http://www.wavefire.com" title="info@wavefire.com; Bot home page [new window]" target="_blank">Wavefire</a>', |
1588 | 'webclipping\.com', 'WebClipping.com', |
1589 | 'webcompass', 'webcompass', |
1590 | 'webcrawl\.net','<a href="http://www.webcrawl.net/" title="webcrawl.net home page [new window]" target="_blank">webcrawl.net</a>', |
1591 | 'web_downloader','<a href="http://www.krasu.ru/soft/chuchelo/" title="Web Downloader home page [new window]" target="_blank">Web Downloader</a>', |
1592 | 'webdup','<a href="http://www.webdup.com/en/index.html" title="Webdup home page [new window]" target="_blank">Webdup</a>', |
1593 | 'webfilter','<a href="http://www.verso.com/enterprise/netspective/webfilter.asp" title="Bot home page [new window]" target="_blank">WebFilter</a>', |
1594 | 'webindexer','<a href="mailto://webindexerv1@yahoo.com" title="WebIndexer home page [new window]" target="_blank">WebIndexer</a>', |
1595 | 'webminer','<a href="http://64.124.122.252/feedback.html" title="WebMiner home page [new window]" target="_blank">WebMiner</a>', |
1596 | 'website[_+ ]monitoring[_+ ]bot','<a href="http://InternetSupervision.com/UrlMonitor/3/" title="Website_Monitoring_Bot home page [new window]" target="_blank">Website_Monitoring_Bot</a>', |
1597 | 'webvulncrawl', 'WebVulnCrawl', |
1598 | 'wells_search','<a href="http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_t_z_1484\b " title="Wells Search home page [new window]" target="_blank">Wells Search</a>', |
1599 | 'wonderer', 'Web Wombat Redback Spider', |
1600 | 'wume_crawler','<a href="http://wume.cse.lehigh.edu/~xiq204/crawler/ " title="wume crawler home page [new window]" target="_blank">wume crawler</a>', |
1601 | 'wwweasel',,'<a href="http://wwweasel.de/" title="Website_Monitoring_Bot home page [new window]" target="_blank">WWWeasel</a>', |
1602 | 'xenu\'s_link_sleuth','<a href="http://home.snafu.de/tilman/xenulink.html" title="Xenu Link Sleuth home page [new window]" target="_blank">Xenu Link Sleuth</a>', |
1603 | 'xenu_link_sleuth','<a href="http://home.snafu.de/tilman/xenulink.html" title="Xenu Link Sleuth home page [new window]" target="_blank">Xenu Link Sleuth</a>', |
1604 | 'xirq','<a href="http://www.xirq.com/" title="xirq home page [new window]" target="_blank">xirq</a>', |
1605 | 'y!j', '<a href="http://help.yahoo.co.jp/help/jp/search/indexing/indexing-15.html" title="Bot home page [new window]" target="_blank">Y!J Yahoo Japan</a>', |
1606 | 'yacy','<a href="http://www.yacy.net/yacy" title="Bot home page [new window]" target="_blank">yacy</a>', |
1607 | 'yahoo\-blogs','<a href="http://help.yahoo.com/help/us/ysearch/crawling/crawling-02.html" title="Bot home page [new window]" target="_blank">Yahoo-Blogs</a>', |
1608 | 'yahoo\-verticalcrawler', 'Yahoo Vertical Crawler', |
1609 | 'yahoofeedseeker', '<a href="http://publisher.yahoo.com/rssguide" title="Bot home page [new window]" target="_blank">Yahoo Feed Seeker</a>', |
1610 | 'yahooseeker\-testing', '<a href="http://search.yahoo.com/" title="Bot home page [new window]" target="_blank">YahooSeeker-Testing</a>', |
1611 | 'yahooseeker', '<a href="http://help.yahoo.com/help/us/ysearch/crawling/crawling-02.html" title="Bot home page [new window]" target="_blank">YahooSeeker Yahoo! Blog crawler</a>', |
1612 | 'yahoo\-mmcrawler', '<a href="mailto:mms-mmcrawler-support@yahoo-inc.com?subject=Yahoo-MMCrawler Information" title="E-mail Bot">Yahoo-MMCrawler</a>', |
1613 | 'yahoo!_mindset','<a href="http://mindset.research.yahoo.com/" title="Bot home page [new window]">Yahoo! Mindset</a>', |
1614 | 'yandex', 'Yandex bot', |
1615 | 'flexum', 'Flexum Search Engine', |
1616 | 'yanga', 'Yanga WorldSearch Bot', |
1617 | 'yooglifetchagent','<a href="http://www.yoogli.com/" title="yoogliFetchAgent home page [new window]" target="_blank">yoogliFetchAgent</a>', |
1618 | 'z\-add_link_checker','<a href="http://w3.z-add.co.uk/linkcheck/" title="Z-Add Link Checker home page [new window]" target="_blank">Z-Add Link Checker</a>', |
1619 | 'zealbot','ZealBot', |
1620 | 'zhuaxia','<a href="http://www.zhuaxia.com/" target="_blank">ZhuaXia</a>', |
1621 | 'zspider','<a href="http://feedback.redkolibri.com/" title="Bot home page [new window]" target="_blank">zspider</a>', |
1622 | 'zeus','<a href="http://www.webmasterworld.com/forum11/1840.htm" title="Bot documentation [new window]" target="_blank">Zeus Webster Pro</a>', |
1623 | 'ng\/1\.','<a href="http://www.exabot.com/" title="Bot home page [new window]" target="_blank">NG 1.x (Exalead)</a>', # put at end to avoid false positive |
1624 | 'ng\/2\.','<a href="http://www.exabot.com/" title="Bot home page [new window]" target="_blank">NG 2.x (Exalead)</a>', # put at end to avoid false positive |
1625 | 'exabot','<a href="http://www.exabot.com/" title="Bot home page [new window]" target="_blank">Exabot</a>', # put at end to avoid false positive |
1626 | # Other IDs that are robots 99% of the time |
1627 | 'wget','WGet tools', |
1628 | 'libwww','Perl tool', |
1629 | 'java\/[0-9]','<a href="http://www.projecthoneypot.org/harvester_useragents.php" title="Bot home page [new window]" target="_blank">Java (Often spam bot)</a>', # put at end to avoid false positive |
1630 | # Generic robot |
1631 | 'robot', 'Unknown robot (identified by \'robot\')', |
1632 | 'checker', 'Unknown robot (identified by \'checker\')', |
1633 | 'crawl', 'Unknown robot (identified by \'crawl\')', |
1634 | 'discovery', 'Unknown robot (identified by \'discovery\')', |
1635 | 'hunter', 'Unknown robot (identified by \'hunter\')', |
1636 | 'scanner', 'Unknown robot (identified by \'scanner\')', |
1637 | 'spider', 'Unknown robot (identified by \'spider\')', |
1638 | 'sucker', 'Unknown robot (identified by \'sucker\')', |
1639 | 'bot[\s_+:,\.\;\/\\\-]','Unknown robot (identified by \'bot*\')', |
1640 | '[\s_+:,\.\;\/\\\-]bot','Unknown robot (identified by \'*bot\')', |
1641 | 'no_user_agent','Unknown robot (identified by empty user agent string)', |
1642 | # Unknown robots identified by hit on robots.txt |
1643 | 'unknown', 'Unknown robot (identified by hit on \'robots.txt\')' |
1644 | ); |
1645 | |
1646 | |
1647 | # RobotsAffiliateLib |
1648 | # Maps a robot ID (key) to the search engine that uses that robot (value). |
1649 | #------------------------------------------------------------- |
1650 | %RobotsAffiliateLib = ( |
1651 |     'fast\-webcrawler'       => 'AllTheWeb', |
1652 |     'googlebot'              => 'Google', |
1653 |     'google\-sitemap'        => 'Google', |
1654 |     'msnbot'                 => 'MSN', |
1655 |     'nutch'                  => 'Looksmart', |
1656 |     'scooter'                => 'AltaVista', |
1657 |     'wisenutbot'             => 'Looksmart', |
1658 |     'yahoo\-blogs'           => 'Yahoo', |
1659 |     'yahoo\-verticalcrawler' => 'Yahoo', |
1660 |     'yahoofeedseeker'        => 'Yahoo', |
1661 |     'yahooseeker\-testing'   => 'Yahoo', |
1662 |     'yahooseeker'            => 'Yahoo', |
1663 |     'yahoo\-mmcrawler'       => 'Yahoo', |
1664 |     'yahoo!_mindset'         => 'Yahoo', |
1665 |     'zyborg'                 => 'Looksmart', |
1666 |     'cfetch'                 => 'Kosmix', |
1667 |     '^voyager\/'             => 'Kosmix', |
1668 | ); |
1669 | |
1670 | 1; # End-of-file true value: Perl's require needs a loaded file to evaluate to true. |