diff --git a/make/fixdosfiles.sh b/make/fixdosfiles.sh new file mode 100755 index 000000000..72f4fcaf5 --- /dev/null +++ b/make/fixdosfiles.sh @@ -0,0 +1,31 @@ +#!/bin/sh +#------------------------------------------------------ +# Script to find files that are not Unix encoded +# +# Laurent Destailleur - eldy@users.sourceforge.net +#------------------------------------------------------ +# Usage: fixdosfiles.sh [list|fix] +#------------------------------------------------------ + +# Syntax +if [ "x$1" != "xlist" -a "x$1" != "xfix" ] +then + echo "This script detect or clean files with CR+LF into files with LF only. All source files are included, also files into includes." + echo "Usage: fixdosfiles.sh [list|fix]" +fi + +# To detec +if [ "x$1" = "xlist" ] +then + find . \( -iname "functions" -o -iname "*.md" -o -iname "*.html" -o -iname "*.htm" -o -iname "*.php" -o -iname "*.sh" -o -iname "*.cml" -o -iname "*.css" -o -iname "*.js" -o -iname "*.lang" -o -iname "*.pl" -o -iname "*.txt" -o -iname "*.xml" \) -exec file "{}" + | grep -v '\/test' | grep CRLF +fi + +# To convert +if [ "x$1" = "xfix" ] +then + for fic in `find . \( -iname "functions" -o -iname "*.md" -o -iname "*.html" -o -iname "*.htm" -o -iname "*.php" -o -iname "*.sh" -o -iname "*.cml" -o -iname "*.css" -o -iname "*.js" -o -iname "*.lang" -o -iname "*.pl" -o -iname "*.txt" -o -iname "*.xml" \) -exec file "{}" + | grep -v '\/test' | grep CRLF | awk -F':' '{ print $1 }' ` + do + echo "Fix file $fic" + dos2unix "$fic" + done; +fi diff --git a/make/fixutf8bomfiles.sh b/make/fixutf8bomfiles.sh new file mode 100755 index 000000000..bda503d28 --- /dev/null +++ b/make/fixutf8bomfiles.sh @@ -0,0 +1,40 @@ +#!/bin/sh +# +# Checks of fix files contains UTF-8 BOM in dolibarr source tree, +# excluding git repository, custom modules and included libraries. 
+# +# Rapha毛l Doursenaud - rdoursenaud@gpcsolutions.fr +# Laurent Destailleur eldy@users.sourceforge.net +#------------------------------------------------------ +# Usage: fixutf8bomfiles.sh [list|fix] +#------------------------------------------------------ + +# Syntax +if [ "x$1" != "xlist" -a "x$1" != "xfix" ] +then + echo "Detect and fix bad UTF8 encoded files (UTF8 must not use BOM char)" + echo "Usage: fixutf8bomfiles.sh (list|fix) [addincludes]" +fi + +if [ "x$2" != "xaddincludes" ] +then + export moreoptions="--exclude-dir='includes'" +fi + +# To detec +if [ "x$1" = "xlist" ] +then + #find . \( -iname '*.php' -print0 -o -iname '*.sh' -print0 -o -iname '*.pl' -print0 -o -iname '*.lang' -print0 -o -iname '*.txt' \) -print0 | xargs -0 awk '/^\xEF\xBB\xBF/ {print FILENAME} {nextfile}' + echo "grep -rlIZ --include='*.php' --include='*.sh' --include='*.pl' --include='*.lang' --include='*.txt' --exclude-dir='.git' --exclude-dir='.tx' $moreoptions --exclude-dir='custom' . . | xargs -0 awk '/^\xEF\xBB\xBF/ {print FILENAME} {nextfile}'" + grep -rlIZ --include='*.php' --include='*.sh' --include='*.pl' --include='*.lang' --include='*.txt' --exclude-dir='.git' --exclude-dir='.tx' $moreoptions --exclude-dir='custom' . . | xargs -0 awk '/^\xEF\xBB\xBF/ {print FILENAME} {nextfile}' +fi + +# To convert +if [ "x$1" = "xfix" ] +then + for fic in `grep -rlIZ --include='*.php' --include='*.sh' --include='*.pl' --include='*.lang' --include='*.txt' --exclude-dir='.git' --exclude-dir='.tx' $moreoptions --exclude-dir='custom' . . 
| xargs -0 awk '/^\xEF\xBB\xBF/ {print FILENAME} {nextfile}'` + do + echo "Fixing $fic" + sed -i '1s/^\xEF\xBB\xBF//' $fic + done; +fi diff --git a/wwwroot/cgi-bin/awdownloadcsv.pl b/wwwroot/cgi-bin/awdownloadcsv.pl index c43acaa72..eaa61ad14 100755 --- a/wwwroot/cgi-bin/awdownloadcsv.pl +++ b/wwwroot/cgi-bin/awdownloadcsv.pl @@ -1,152 +1,152 @@ -#!/usr/bin/perl -w -#------------------------------------------------------------------------------ -# Free addition to AWStats Web Log Analyzer. Used to export the contents of -# sections of the Apache server log database to CSV for use in other tools. -# Works from command line or as a CGI. -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -#------------------------------------------------------------------------------ -use CGI qw(:standard); - -my $ALLOWDOWNLOAD=0; - -# Disabled by default for security reason -if (! 
$ALLOWDOWNLOAD) -{ - print("Error: You must first edit script to change ALLOWDOWNLOAD to 1 to allow usage of this script.\n"); - print("Reason is that enabling this script may be a security hole as it allows someone to download/view details of your awstats data files.\n"); - exit; -} - -my $q = new CGI; -my $outputFile = ""; # used to write the output to a file -my $inputFile = ""; # the fully qualified path to the input log database file -my $sectionToReport = ""; # contains the tag to search for in the database file -my $startSearchStr = "BEGIN_"; -my $endSearchStr = "END_"; -my $startPrinting = 0; # flag to indicate that the start tag has been found -my $attachFileName = ""; - -# These parameters are used to build the input file name of the awstats log database -my $baseName = ""; -my $month = ""; -my $year = ""; -my $day = ""; -my $siteConfig = ""; - -if ($q->param("outputFile")) { - if ($outputFile eq '') { $outputFile = $q->param("outputFile"); } -} - -if ($q->param("inputFile")) { - if ($inputFile eq '') { $inputFile = $q->param("inputFile"); } -} - -if ($q->param("section")) { - if ($sectionToReport eq '' ) { $sectionToReport = $q->param("section"); } -} - -if ($q->param("baseName")) { - if ($baseName eq '' ) { $baseName = $q->param("baseName"); } -} - -if ($q->param("month")) { - if ($month eq '' ) { $month = $q->param("month"); } -} - -if ($q->param("year")) { - if ($year eq '' ) { $year = $q->param("year"); } -} - -if ($q->param("day")) { $day = $q->param("day"); } - -if ($q->param("siteConfig")) { - if ($siteConfig eq '' ) { $siteConfig = $q->param("siteConfig"); } -} - -# set the attachment file name to the report section -if ($sectionToReport ne '' ) { - $attachFileName = $sectionToReport . ".csv"; -} else { - $attachFileName = "exportCSV.csv"; -} -print $q->header(-type=> "application/force-download", -attachment=>$attachFileName); - -# Build the start/end search tags -$startSearchStr = $startSearchStr . 
$sectionToReport; -$endSearchStr = $endSearchStr . $sectionToReport; - -if ( !$inputFile ) { $inputFile ="$baseName$month$year$day.$siteConfig.txt" }; - -open (IN, $inputFile) || die "cannot open $inputFile\n"; - -# If there's a parameter for the output, open it here -if ($outputFile ne '') { - open (OUT,">$outputFile") || die "cannot create $outputFile\n"; - flock (OUT, 2); -} -# Loop through the input file searching for the start string. When -# found, start displaying the input lines (with spaces changed -# to commas) until the end tag is found. - -# Array to store comments for printing once we hit the desired section -my $commentCount = -1; -my %commentArray; - -while () { - chomp; - - if (/^#\s(.*-)\s/){ # search for comment lines - s/ - /,/g; # replace dashes with commas - s/#//; # get rid of the comment sign - $commentArray[++$commentCount] = $_; - } - - # put the test to end printing here to eliminate printing - # the line with the END tag - if (/^$endSearchStr\b/) { - $startPrinting = 0; - } - - if ($startPrinting) { - s/ /,/g; - print "$_\n"; - if ($outputFile ne '') { - print OUT "$_\n"; - } - } - # if we find an END tag and we haven't started printing, reset the - # comment array to start re-capturing comments for next section - if ((/^END_/) && ($startPrinting == 0)) { - $commentCount = -1; - } - - # put the start printing test after the first input line - # to eliminate printing the line with the BEGIN tag...find it - # here, then start printing on the next input line - if (/^$startSearchStr\b/) { - $startPrinting = 1; - # print the comment array - it provides labels for the columns - for ($i = 0; $i <= $commentCount; $i++ ) { - print "$commentArray[$i]\n"; - } - } -} - -close(IN); - -# Close the output file if there was one used -if ($outputFile ne '') { - close(OUT); -} +#!/usr/bin/perl -w +#------------------------------------------------------------------------------ +# Free addition to AWStats Web Log Analyzer. 
Used to export the contents of +# sections of the Apache server log database to CSV for use in other tools. +# Works from command line or as a CGI. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +#------------------------------------------------------------------------------ +use CGI qw(:standard); + +my $ALLOWDOWNLOAD=0; + +# Disabled by default for security reason +if (! $ALLOWDOWNLOAD) +{ + print("Error: You must first edit script to change ALLOWDOWNLOAD to 1 to allow usage of this script.\n"); + print("Reason is that enabling this script may be a security hole as it allows someone to download/view details of your awstats data files.\n"); + exit; +} + +my $q = new CGI; +my $outputFile = ""; # used to write the output to a file +my $inputFile = ""; # the fully qualified path to the input log database file +my $sectionToReport = ""; # contains the tag to search for in the database file +my $startSearchStr = "BEGIN_"; +my $endSearchStr = "END_"; +my $startPrinting = 0; # flag to indicate that the start tag has been found +my $attachFileName = ""; + +# These parameters are used to build the input file name of the awstats log database +my $baseName = ""; +my $month = ""; +my $year = ""; +my $day = ""; +my $siteConfig = ""; + +if ($q->param("outputFile")) { + if ($outputFile eq '') { $outputFile = $q->param("outputFile"); } +} + +if ($q->param("inputFile")) { + if ($inputFile eq '') { $inputFile = 
$q->param("inputFile"); } +} + +if ($q->param("section")) { + if ($sectionToReport eq '' ) { $sectionToReport = $q->param("section"); } +} + +if ($q->param("baseName")) { + if ($baseName eq '' ) { $baseName = $q->param("baseName"); } +} + +if ($q->param("month")) { + if ($month eq '' ) { $month = $q->param("month"); } +} + +if ($q->param("year")) { + if ($year eq '' ) { $year = $q->param("year"); } +} + +if ($q->param("day")) { $day = $q->param("day"); } + +if ($q->param("siteConfig")) { + if ($siteConfig eq '' ) { $siteConfig = $q->param("siteConfig"); } +} + +# set the attachment file name to the report section +if ($sectionToReport ne '' ) { + $attachFileName = $sectionToReport . ".csv"; +} else { + $attachFileName = "exportCSV.csv"; +} +print $q->header(-type=> "application/force-download", -attachment=>$attachFileName); + +# Build the start/end search tags +$startSearchStr = $startSearchStr . $sectionToReport; +$endSearchStr = $endSearchStr . $sectionToReport; + +if ( !$inputFile ) { $inputFile ="$baseName$month$year$day.$siteConfig.txt" }; + +open (IN, $inputFile) || die "cannot open $inputFile\n"; + +# If there's a parameter for the output, open it here +if ($outputFile ne '') { + open (OUT,">$outputFile") || die "cannot create $outputFile\n"; + flock (OUT, 2); +} +# Loop through the input file searching for the start string. When +# found, start displaying the input lines (with spaces changed +# to commas) until the end tag is found. 
+ +# Array to store comments for printing once we hit the desired section +my $commentCount = -1; +my %commentArray; + +while () { + chomp; + + if (/^#\s(.*-)\s/){ # search for comment lines + s/ - /,/g; # replace dashes with commas + s/#//; # get rid of the comment sign + $commentArray[++$commentCount] = $_; + } + + # put the test to end printing here to eliminate printing + # the line with the END tag + if (/^$endSearchStr\b/) { + $startPrinting = 0; + } + + if ($startPrinting) { + s/ /,/g; + print "$_\n"; + if ($outputFile ne '') { + print OUT "$_\n"; + } + } + # if we find an END tag and we haven't started printing, reset the + # comment array to start re-capturing comments for next section + if ((/^END_/) && ($startPrinting == 0)) { + $commentCount = -1; + } + + # put the start printing test after the first input line + # to eliminate printing the line with the BEGIN tag...find it + # here, then start printing on the next input line + if (/^$startSearchStr\b/) { + $startPrinting = 1; + # print the comment array - it provides labels for the columns + for ($i = 0; $i <= $commentCount; $i++ ) { + print "$commentArray[$i]\n"; + } + } +} + +close(IN); + +# Close the output file if there was one used +if ($outputFile ne '') { + close(OUT); +} diff --git a/wwwroot/cgi-bin/lang/awstats-cn.txt b/wwwroot/cgi-bin/lang/awstats-cn.txt index cb6e7bc08..0e797f7e1 100644 --- a/wwwroot/cgi-bin/lang/awstats-cn.txt +++ b/wwwroot/cgi-bin/lang/awstats-cn.txt @@ -1,182 +1,182 @@ -# Chinese (simplified) message file (by Che Dong chedongATgmail.com) -# $Revision$ - $Date$ -PageCode=GBK -message0=无法得知 -message1=无法得知(不能反向解析的网域名称) -message2=其他 -message3=查看详细资料 -message4=日期 -message5=月 -message6=年 -message7=统计网站 -message8=首次参观日期 -message9=最近参观日期 -message10=参观人次 -message11=参观者 -message12=参观 -message13=个关键字词 -message14=搜索 -message15=百分比 -message16=流量统计 -message17=网域或国家 -message18=参观者 -message19=URL 网址 -message20=每小时浏览次数 -message21=浏览器 -message22=HTTP 错误 -message23=反相链接 -message24=从未更新(请参考 
awstats_setup.html上的 'Build/Update') -message25=参观者的网域或国家 -message26=主机数 -message27=网页数 -message28=个不同的网页 -message29=存取次数 -message30=不同的字词 -message31=找不到的网页 -message32=HTTP 错误码 -message33=Netscape 版本 -message34=IE 版本 -message35=最近更新 -message36=链接网站的方法 -message37=来源网址 -message38=网址由参观者自行输入或从书签取出 -message39=无法得知连结的方法 -message40=来自搜索引擎 -message41=来自此网站外的其他网页 (非搜索引擎) -message42=从网站内部连结 -message43=网站搜索的关键字句 -message44=网站搜索的关键字词 -message45=无法反解译的IP地址 -message46=无法得知的操作系统 -message47=找不到的网址链接 (HTTP 错误码 404) -message48=IP 地址 -message49=错误次数 -message50=无法得知的浏览器 -message51=个机器人 -message52=参观人次/参观者 -message53=搜索引擎网站的机器人 -message54=网页纪录分析系统 -message55=个於 -message56=网页数 -message57=文件数 -message58=版本 -message59=操作系统 -message60=01月 -message61=02月 -message62=03月 -message63=04月 -message64=05月 -message65=06月 -message66=07月 -message67=08月 -message68=09月 -message69=10月 -message70=11月 -message71=12月 -message72=浏览器统计 -message73=文件类别 -message74=立即更新 -message75=字节 -message76=回到主页 -message77=前 -message78=yyyy年mm月dd日 HH:MM -message79=过滤包含 -message80=全部列出 -message81=主机 -message82=个解译成功 -message83=搜索引擎网站 -message84=日 -message85=一 -message86=二 -message87=三 -message88=四 -message89=五 -message90=六 -message91=按星期 -message92=按参观者 -message93=按参观时间 -message94=鉴别出的用户 -message95=最小 -message96=平均数 -message97=最大 -message98=网页压缩 -message99=节省了的带宽 -message100=压缩前 -message101=压缩后 -message102=总数 -message103=个不同的关键字句 -message104=入站处 -message105=编码 -message106=平均大小 -message107=从新闻群组链接 -message108=KB -message109=MB -message110=GB -message111=离线浏览器(网页抓取) -message112=是 -message113=否 -message114=Whois 信息 -message115=OK -message116=出站处 -message117=每次参观所花时间 -message118=关闭此窗口 -message119=Bytes -message120=用以搜索的短语 -message121=用以搜索的关键词 -message122=个不同的搜索引擎转介参观者到这站 -message123=个不同的其他网站转介参观者到这站 -message124=其他短语 -message125=其他登录 (包括匿名登录) -message126=由那些搜索引擎转介 -message127=由那些其他网站转介 -message128=摘要 -message129=作全年统计时,无法准确得知参观者的数目 -message130=数据值数组 -message131=发信人邮址 -message132=收信人邮件地址 -message133=报表日期 -message134=特别/市场 
-message135=屏幕分辨率 -message136=蠕虫/病毒 攻击 -message137=加入到收藏夹(估计) -message138=按日期统计 -message139=其他 -message140=浏览器支持 Java -message141=浏览器支持 Macromedia Director -message142=浏览器支持 Flash -message143=浏览器支持 Real audio 播放 -message144=浏览器支持 Quicktime audio 播放 -message145=浏览器支持 Windows Media audio 播放 -message146=浏览器支持 PDF -message147=SMTP错误代码 -message148=国家或地区 -message149=邮件 -message150=大小 -message151=第一个 -message152=最末一个 -message153=过滤不包含 -message154=非浏览器产生的流量(来自搜索引擎机器人,病毒蠕虫等) -message155=集群 -message156=以上列出的搜索引擎机器人产生的“非浏览器”流量并未包含在其他图表中 -message157=“+”后的数字为成功的“robots.txt”访问次数 -message158=以上列出的蠕虫产生的“非浏览器”流量并未包含在其他图表中 -message159=非浏览的流量包括搜索引擎机器人,蠕虫病毒产生的流量和非正常的HTTP相应 -message160=浏览器流量 -message161=非浏览器流量 -message162=按月历史统计 -message163=蠕虫 -message164=不同的蠕虫 -message165=成功发送邮件 -message166=邮件失败或拒收 -message167=敏感目标 -message168=Javascript禁用 -message169=创建者 -message170=插件 -message171=地区 -message172=城市 -message173=Opera 版本 -message174=Safari 版本 -message175=Chrome 版本 -message176=Konqueror 版本 -message177=, -message178=下载 +# Chinese (simplified) message file (by Che Dong chedongATgmail.com) +# $Revision$ - $Date$ +PageCode=GBK +message0=无法得知 +message1=无法得知(不能反向解析的网域名称) +message2=其他 +message3=查看详细资料 +message4=日期 +message5=月 +message6=年 +message7=统计网站 +message8=首次参观日期 +message9=最近参观日期 +message10=参观人次 +message11=参观者 +message12=参观 +message13=个关键字词 +message14=搜索 +message15=百分比 +message16=流量统计 +message17=网域或国家 +message18=参观者 +message19=URL 网址 +message20=每小时浏览次数 +message21=浏览器 +message22=HTTP 错误 +message23=反相链接 +message24=从未更新(请参考 awstats_setup.html上的 'Build/Update') +message25=参观者的网域或国家 +message26=主机数 +message27=网页数 +message28=个不同的网页 +message29=存取次数 +message30=不同的字词 +message31=找不到的网页 +message32=HTTP 错误码 +message33=Netscape 版本 +message34=IE 版本 +message35=最近更新 +message36=链接网站的方法 +message37=来源网址 +message38=网址由参观者自行输入或从书签取出 +message39=无法得知连结的方法 +message40=来自搜索引擎 +message41=来自此网站外的其他网页 (非搜索引擎) +message42=从网站内部连结 +message43=网站搜索的关键字句 +message44=网站搜索的关键字词 +message45=无法反解译的IP地址 +message46=无法得知的操作系统 
+message47=找不到的网址链接 (HTTP 错误码 404) +message48=IP 地址 +message49=错误次数 +message50=无法得知的浏览器 +message51=个机器人 +message52=参观人次/参观者 +message53=搜索引擎网站的机器人 +message54=网页纪录分析系统 +message55=个於 +message56=网页数 +message57=文件数 +message58=版本 +message59=操作系统 +message60=01月 +message61=02月 +message62=03月 +message63=04月 +message64=05月 +message65=06月 +message66=07月 +message67=08月 +message68=09月 +message69=10月 +message70=11月 +message71=12月 +message72=浏览器统计 +message73=文件类别 +message74=立即更新 +message75=字节 +message76=回到主页 +message77=前 +message78=yyyy年mm月dd日 HH:MM +message79=过滤包含 +message80=全部列出 +message81=主机 +message82=个解译成功 +message83=搜索引擎网站 +message84=日 +message85=一 +message86=二 +message87=三 +message88=四 +message89=五 +message90=六 +message91=按星期 +message92=按参观者 +message93=按参观时间 +message94=鉴别出的用户 +message95=最小 +message96=平均数 +message97=最大 +message98=网页压缩 +message99=节省了的带宽 +message100=压缩前 +message101=压缩后 +message102=总数 +message103=个不同的关键字句 +message104=入站处 +message105=编码 +message106=平均大小 +message107=从新闻群组链接 +message108=KB +message109=MB +message110=GB +message111=离线浏览器(网页抓取) +message112=是 +message113=否 +message114=Whois 信息 +message115=OK +message116=出站处 +message117=每次参观所花时间 +message118=关闭此窗口 +message119=Bytes +message120=用以搜索的短语 +message121=用以搜索的关键词 +message122=个不同的搜索引擎转介参观者到这站 +message123=个不同的其他网站转介参观者到这站 +message124=其他短语 +message125=其他登录 (包括匿名登录) +message126=由那些搜索引擎转介 +message127=由那些其他网站转介 +message128=摘要 +message129=作全年统计时,无法准确得知参观者的数目 +message130=数据值数组 +message131=发信人邮址 +message132=收信人邮件地址 +message133=报表日期 +message134=特别/市场 +message135=屏幕分辨率 +message136=蠕虫/病毒 攻击 +message137=加入到收藏夹(估计) +message138=按日期统计 +message139=其他 +message140=浏览器支持 Java +message141=浏览器支持 Macromedia Director +message142=浏览器支持 Flash +message143=浏览器支持 Real audio 播放 +message144=浏览器支持 Quicktime audio 播放 +message145=浏览器支持 Windows Media audio 播放 +message146=浏览器支持 PDF +message147=SMTP错误代码 +message148=国家或地区 +message149=邮件 +message150=大小 +message151=第一个 +message152=最末一个 +message153=过滤不包含 +message154=非浏览器产生的流量(来自搜索引擎机器人,病毒蠕虫等) 
+message155=集群 +message156=以上列出的搜索引擎机器人产生的“非浏览器”流量并未包含在其他图表中 +message157=“+”后的数字为成功的“robots.txt”访问次数 +message158=以上列出的蠕虫产生的“非浏览器”流量并未包含在其他图表中 +message159=非浏览的流量包括搜索引擎机器人,蠕虫病毒产生的流量和非正常的HTTP相应 +message160=浏览器流量 +message161=非浏览器流量 +message162=按月历史统计 +message163=蠕虫 +message164=不同的蠕虫 +message165=成功发送邮件 +message166=邮件失败或拒收 +message167=敏感目标 +message168=Javascript禁用 +message169=创建者 +message170=插件 +message171=地区 +message172=城市 +message173=Opera 版本 +message174=Safari 版本 +message175=Chrome 版本 +message176=Konqueror 版本 +message177=, +message178=下载 diff --git a/wwwroot/cgi-bin/lang/awstats-lv.txt b/wwwroot/cgi-bin/lang/awstats-lv.txt index 9f3985ae5..fce13900b 100644 --- a/wwwroot/cgi-bin/lang/awstats-lv.txt +++ b/wwwroot/cgi-bin/lang/awstats-lv.txt @@ -1,178 +1,178 @@ -锘# Latvie拧u valodas zi艈ojumu fails (madmaster@gobbo.caves.lv) -# Updated by edvinsma@inbox.lv 2004/01/24 00:40:00 -# $Revision$ - $Date$ -PageCode=utf-8 -message0=Nezin膩ms -message1=Nezin膩ms (neatpaz墨ts ip) -message2=Citi -message3=Apskat墨t izv膿rsti -message4=Diena -message5=M膿nesis -message6=Gads -message7=Statistika -message8=Pirmais apmekl膿jums -message9=P膿d膿jais apmekl膿jums -message10=Viz墨拧u skaits -message11=Unik膩lie apmekl膿t膩ji -message12=Apmekl膿jums -message13=at拧姆ir墨gi(s) atsl膿gv膩rdi(s) -message14=Mekl膿t -message15=Procenti -message16=Trafiks -message17=Domaini/Valstis -message18=Apmekl膿t膩ji -message19=Lapas-URL -message20=Stundas -message21=P膩rl奴kprogrammas -message22=HTTP K募奴das -message23=Nor膩d墨t膩ji -message24=Mekl膿t Atsl膿gv膩rdus -message25=Apmekl膿t膩ju domaini/valstis -message26=hosti -message27=lapas -message28=at拧姆ir墨gas lapas -message29=Skat墨tas lapas -message30=Citi v膩rdi -message31=Neatrastas lapas -message32=HTTP K募奴du kodi -message33=Netscape versijas -message34=IE versijas -message35=P膿d膿jais jaunin膩jums -message36=Pievienoties saitei no -message37=Ori模in膩li -message38=Tie拧膩 adrese / Gr膩matz墨mes -message39=Or模in膩ls nezin膩ms -message40=Nor膩des no Interneta Mekl膿拧anas Sait膿m -message41=Nor膩des no 
膩r膿j膩m lap膩m (citas web lapas iz艈emot mekl膿拧anas saites) -message42=Links from an internal page (cita lapa 拧aj膩 pa拧膩 sait膿) -message43=Atsl膿gv膩rdi kas lietoti mekl膿拧anas sait膿s -message44=Kb -message45=Neatpaz墨tas IP Addreses -message46=Nezin膩ms OS (Nor膩des Lauks) -message47=Piepras墨ts bet neatrasts URLs (HTTP kods 404) -message48=IP Addrese -message49=K募uda Tr膩p墨jumi -message50=Nezin膩mi p膩rl奴ki (Nor膩des lauks) -message51=Apmekl膿ju拧ie roboti -message52=apmekl膿jumi/apmekl膿t膩ji -message53=Roboti/Zirnek募i apmekl膿t膩ji -message54=Br墨vs re膩l膩 laika logfailu analizators advanc膿tai web statistikai -message55=no -message56=Lapas -message57=Tr膩p墨jumi -message58=Versijas -message59=Oper膿t膩jsist膿mas -message60=Jan -message61=Feb -message62=Mar -message63=Apr -message64=Mai -message65=J奴n -message66=J奴l -message67=Aug -message68=Sep -message69=Okt -message70=Nov -message71=Dec -message72=Navig膩cija -message73=Failu tips -message74=Atjaunot -message75=Baiti -message76=Atpaka募 uz galveno lapu -message77=Aug拧a -message78=dd mmm yyyy - HH:MM -message79=Filtrs -message80=Pilns saraksts -message81=Hosti -message82=Zin膩ms -message83=Roboti -message84=Sv -message85=Pir -message86=Ot -message87=Tr -message88=Ce -message89=Pkt -message90=Se -message91=Ned膿募as dienas -message92=Kas -message93=Kad -message94=Autentific膿tie lietot膩ji -message95=Min -message96=Vid -message97=Maks -message98=Web sal墨dzin膩jums -message99=saglab膩tais joslas platums -message100=Pirms kompresijas -message101=P膿c kompresijas -message102=Kop膩 -message103=At拧姆ir墨gi atsl膿gv膩rdi -message104=Iejas lapas -message105=Kods -message106=Vid膿jais izm膿rs -message107=Saites no Zi艈u grup膩m -message108=KB -message109=MB -message110=GB -message111=Sav膩c膿js -message112=J膩 -message113=N膿 -message114=WhoIs inform膩cija -message115=OK -message116=Izejas pages -message117=Apmekl膿juma ilgums -message118=Aizv膿rt logu -message119=Baiti -message120=Mekl膿拧anas atsl膿gfr膩zes -message121=Mekl膿拧anas atsl膿gv膩rdi -message122=Citas mekl膿t膩ju lapas 
ar atsauc膿m -message123=Citas lapas ar atsauc膿m -message124=Citas fr膩zes -message125=Anon墨mie lietot膩ji -message126=Mekl膿t膩ju lapas ar atsauc膿m -message127=Lapas ar atsauc膿m -message128=Kopsavilkums -message129=Prec墨za v膿rt墨ba sada募膩 "Gads" nav pieejama -message130=Datu v膿r墨bu kopnes -message131=S奴t墨t膩ja adrese -message132=Sa艈膿m膿ja adrese -message133=Atskaites periods -message134=Papildus/M膩rketings -message135=Ekr膩na iz拧姆ir拧anas sp膿ja -message136=V墨rusu uzbrukumi -message137=Pievienots izlasei -message138=M膿ne拧a dienas -message139=Da啪膩di -message140=P膩rl奴kprogrammas ar Java atbalstu -message141=P膩rl奴kprogrammas ar Macromedia Director atbalstu -message142=P膩rl奴kprogrammas ar Flash atbalstu -message143=P膩rl奴kprogrammas ar RealAudio atbalstu -message144=P膩rl奴kprogrammas ar QuickTime atbalstu -message145=P膩rl奴kprogrammas ar Windows Media atbalstu -message146=P膩rl奴kprogrammas ar PDF atbalstu -message147=SMTP k募奴du kodi -message148=Valstis -message149=E-pasti -message150=Izm膿rs -message151=S膩kums -message152=Beigas -message153=Izsl膿g拧anas filtrs -message154=艩eit kodi par膩da 拧膩vienus vai trafiku, ko nav apskat墨ju拧i lietot膩ji, t膩p膿c vi艈i nav iek募auti cit膩s diagramm膩s. -message155=Puduris -message156=艩eit uzr膩d墨tie roboti ir rad墨ju拧i tr膩pijumus vai "nepskat墨to" trafiku, t膩p膿c tie nav iek募auti cit膩s diagramm膩s. -message157=Skaitlis p膿c "+" ir veiksm墨go 拧膩vienu skaits robots.txt failam. -message158=艩ie ir uzr膩d墨ti tr膩pijumi vai trafiks ko rad墨ja t墨kla t膩rpi vai ar墨 "neapskat墨t膩s" lapas, t膩p膿c tie nav iek募auti cit膩s -diagramm膩s. -message159="Neapskat墨to" trafiku 模ener膿 roboti, t墨kla t膩rpi, vai ar墨 atbildes ar specialo HTTP statusa kodu. 
-message160=Apskat墨ts trafiks -message161=Nav apskat墨ts trafiks -message162=M膿ne拧a atskaite -message163=T墨kla t膩rpi -message164=Da啪膩di t墨kla t膩rpi -message165=Veiksm墨gi nos奴t墨ti e-pasti -message166=Neveiksm墨gas e-pasta s奴t墨拧anas -message167=Ievainojam墨ba -message168=Atsl膿gtsw Javascript -message169=Izveidojis -message170=spraud艈i -message171=Re模ioni +# Latvie拧u valodas zi艈ojumu fails (madmaster@gobbo.caves.lv) +# Updated by edvinsma@inbox.lv 2004/01/24 00:40:00 +# $Revision$ - $Date$ +PageCode=utf-8 +message0=Nezin膩ms +message1=Nezin膩ms (neatpaz墨ts ip) +message2=Citi +message3=Apskat墨t izv膿rsti +message4=Diena +message5=M膿nesis +message6=Gads +message7=Statistika +message8=Pirmais apmekl膿jums +message9=P膿d膿jais apmekl膿jums +message10=Viz墨拧u skaits +message11=Unik膩lie apmekl膿t膩ji +message12=Apmekl膿jums +message13=at拧姆ir墨gi(s) atsl膿gv膩rdi(s) +message14=Mekl膿t +message15=Procenti +message16=Trafiks +message17=Domaini/Valstis +message18=Apmekl膿t膩ji +message19=Lapas-URL +message20=Stundas +message21=P膩rl奴kprogrammas +message22=HTTP K募奴das +message23=Nor膩d墨t膩ji +message24=Mekl膿t Atsl膿gv膩rdus +message25=Apmekl膿t膩ju domaini/valstis +message26=hosti +message27=lapas +message28=at拧姆ir墨gas lapas +message29=Skat墨tas lapas +message30=Citi v膩rdi +message31=Neatrastas lapas +message32=HTTP K募奴du kodi +message33=Netscape versijas +message34=IE versijas +message35=P膿d膿jais jaunin膩jums +message36=Pievienoties saitei no +message37=Ori模in膩li +message38=Tie拧膩 adrese / Gr膩matz墨mes +message39=Or模in膩ls nezin膩ms +message40=Nor膩des no Interneta Mekl膿拧anas Sait膿m +message41=Nor膩des no 膩r膿j膩m lap膩m (citas web lapas iz艈emot mekl膿拧anas saites) +message42=Links from an internal page (cita lapa 拧aj膩 pa拧膩 sait膿) +message43=Atsl膿gv膩rdi kas lietoti mekl膿拧anas sait膿s +message44=Kb +message45=Neatpaz墨tas IP Addreses +message46=Nezin膩ms OS (Nor膩des Lauks) +message47=Piepras墨ts bet neatrasts URLs (HTTP kods 404) +message48=IP Addrese +message49=K募uda Tr膩p墨jumi +message50=Nezin膩mi p膩rl奴ki (Nor膩des lauks) 
+message51=Apmekl膿ju拧ie roboti +message52=apmekl膿jumi/apmekl膿t膩ji +message53=Roboti/Zirnek募i apmekl膿t膩ji +message54=Br墨vs re膩l膩 laika logfailu analizators advanc膿tai web statistikai +message55=no +message56=Lapas +message57=Tr膩p墨jumi +message58=Versijas +message59=Oper膿t膩jsist膿mas +message60=Jan +message61=Feb +message62=Mar +message63=Apr +message64=Mai +message65=J奴n +message66=J奴l +message67=Aug +message68=Sep +message69=Okt +message70=Nov +message71=Dec +message72=Navig膩cija +message73=Failu tips +message74=Atjaunot +message75=Baiti +message76=Atpaka募 uz galveno lapu +message77=Aug拧a +message78=dd mmm yyyy - HH:MM +message79=Filtrs +message80=Pilns saraksts +message81=Hosti +message82=Zin膩ms +message83=Roboti +message84=Sv +message85=Pir +message86=Ot +message87=Tr +message88=Ce +message89=Pkt +message90=Se +message91=Ned膿募as dienas +message92=Kas +message93=Kad +message94=Autentific膿tie lietot膩ji +message95=Min +message96=Vid +message97=Maks +message98=Web sal墨dzin膩jums +message99=saglab膩tais joslas platums +message100=Pirms kompresijas +message101=P膿c kompresijas +message102=Kop膩 +message103=At拧姆ir墨gi atsl膿gv膩rdi +message104=Iejas lapas +message105=Kods +message106=Vid膿jais izm膿rs +message107=Saites no Zi艈u grup膩m +message108=KB +message109=MB +message110=GB +message111=Sav膩c膿js +message112=J膩 +message113=N膿 +message114=WhoIs inform膩cija +message115=OK +message116=Izejas pages +message117=Apmekl膿juma ilgums +message118=Aizv膿rt logu +message119=Baiti +message120=Mekl膿拧anas atsl膿gfr膩zes +message121=Mekl膿拧anas atsl膿gv膩rdi +message122=Citas mekl膿t膩ju lapas ar atsauc膿m +message123=Citas lapas ar atsauc膿m +message124=Citas fr膩zes +message125=Anon墨mie lietot膩ji +message126=Mekl膿t膩ju lapas ar atsauc膿m +message127=Lapas ar atsauc膿m +message128=Kopsavilkums +message129=Prec墨za v膿rt墨ba sada募膩 "Gads" nav pieejama +message130=Datu v膿r墨bu kopnes +message131=S奴t墨t膩ja adrese +message132=Sa艈膿m膿ja adrese +message133=Atskaites periods +message134=Papildus/M膩rketings 
+message135=Ekr膩na iz拧姆ir拧anas sp膿ja +message136=V墨rusu uzbrukumi +message137=Pievienots izlasei +message138=M膿ne拧a dienas +message139=Da啪膩di +message140=P膩rl奴kprogrammas ar Java atbalstu +message141=P膩rl奴kprogrammas ar Macromedia Director atbalstu +message142=P膩rl奴kprogrammas ar Flash atbalstu +message143=P膩rl奴kprogrammas ar RealAudio atbalstu +message144=P膩rl奴kprogrammas ar QuickTime atbalstu +message145=P膩rl奴kprogrammas ar Windows Media atbalstu +message146=P膩rl奴kprogrammas ar PDF atbalstu +message147=SMTP k募奴du kodi +message148=Valstis +message149=E-pasti +message150=Izm膿rs +message151=S膩kums +message152=Beigas +message153=Izsl膿g拧anas filtrs +message154=艩eit kodi par膩da 拧膩vienus vai trafiku, ko nav apskat墨ju拧i lietot膩ji, t膩p膿c vi艈i nav iek募auti cit膩s diagramm膩s. +message155=Puduris +message156=艩eit uzr膩d墨tie roboti ir rad墨ju拧i tr膩pijumus vai "nepskat墨to" trafiku, t膩p膿c tie nav iek募auti cit膩s diagramm膩s. +message157=Skaitlis p膿c "+" ir veiksm墨go 拧膩vienu skaits robots.txt failam. +message158=艩ie ir uzr膩d墨ti tr膩pijumi vai trafiks ko rad墨ja t墨kla t膩rpi vai ar墨 "neapskat墨t膩s" lapas, t膩p膿c tie nav iek募auti cit膩s +diagramm膩s. +message159="Neapskat墨to" trafiku 模ener膿 roboti, t墨kla t膩rpi, vai ar墨 atbildes ar specialo HTTP statusa kodu. 
+message160=Apskat墨ts trafiks +message161=Nav apskat墨ts trafiks +message162=M膿ne拧a atskaite +message163=T墨kla t膩rpi +message164=Da啪膩di t墨kla t膩rpi +message165=Veiksm墨gi nos奴t墨ti e-pasti +message166=Neveiksm墨gas e-pasta s奴t墨拧anas +message167=Ievainojam墨ba +message168=Atsl膿gtsw Javascript +message169=Izveidojis +message170=spraud艈i +message171=Re模ioni message172=Pils膿tas \ No newline at end of file diff --git a/wwwroot/cgi-bin/lib/robots.pm b/wwwroot/cgi-bin/lib/robots.pm index c443f66bd..f6124146f 100644 --- a/wwwroot/cgi-bin/lib/robots.pm +++ b/wwwroot/cgi-bin/lib/robots.pm @@ -1,2219 +1,2219 @@ -# AWSTATS ROBOTS DATABASE -#------------------------------------------------------- -# If you want to add robots to extend AWStats database detection capabilities, -# you must add an entry in RobotsSearchIDOrder_listx and RobotsHashIDLib. - -# The entry in RobotsSearchIDOrder_listx is a Perl regular expression -# (see http://perldoc.perl.org/perlreref.html). AWSTats applies these -# expressions to the user agent string in the order given by the lists. The -# first match specifies the robot. -# -# Note: This regular expression must not contain any whitespace. -# Otherwise AWStats will produce lines in the database that -# will be misinterpreted and as a consequence the corresponding data in the -# generated HTML reports will be wrong. If you want to match whitespace in -# the user agent string, use other constructs like '\s', '[:blank:]', -# '\p{IsSpace}', '\x20' etc. -# -# The corresponding entry in RobotsHashIDLib contains the regular expression -# as key, followed by a string containing HTML-text. AWStats inserts this -# text into reports to describe the bot. If possible the text should contain -# a link to the bot home page. This makes it easier for sysadmins to find -# the information necessary e.g. to adapt the robots.txt file. -# -# An entry in the RobotsAffiliateLib is not necessary. 
An entry in this list -# contains as first part the regular expression specifying the bot. The -# second part is a string that gives the Company or product managing the bot. -# This information is not used yet. -# -# There are several sorts of bots that AWStats is not able to detect and -# therefore a considerable amount of bot generated traffic counts -# as user traffic: -# -# a) A crawler that identifies itself in the referrer string, but not in -# the user agent string. An example is the crawler from semalt.semalt.com. -# -# b) Crawlers that correctly access robots.txt but identify themselves in -# in the user agent string only once or just a few times. Most of the -# time a user agent string ist used that does not contain hints that -# a bot is involved. An example is the iCjobs spider. -# msnbot-UDiscovery/2.0b seems to show this behaviour too. -# -# -# -#------------------------------------------------------- - -# 2005-08-19 Sean Carlos http://www.antezeta.com/awstats.html -# added dipsie (not tested with real data). -# added DomainsDB.net http://domainsdb.net/ -# added ia_archiver-web.archive.org (was inadvertently grouped with Alexa traffic) -# added Nutch (used by looksmart (furl?)) -# added rssImagesBot -# added Sqworm -# added t\-h\-u\-n\-d\-e\-r\-s\-t\-o\-n\-e -# added w3c css-validator -# added documentation link to bot home pages for above and selected major bots. -# In the case of international bots, choose .com page. -# Included tool tip (html "title"). -# To do: parameterize to match both AWStats language and tooltips settings. -# To do: add html links for all bots based on current documentation in source -# files referenced below. 
-# changed '\wbot[\/\-]', to '\wbot[\/\-]' (removed comma) -# made minor grammar corrections to notes below -# 2005-08-24 added YahooSeeker-Testing -# added w3c-checklink -# updated url for ask.com -# 2005-08-24 added Girafabot http://www.girafa.com/ -# 2005-08-30 added PluckFeedCrawler http://www.pluck.com/ -# added Gaisbot/3.0 (robot05@gais.cs.ccu.edu.tw; ) -# dded geniebot (wgao@genieknows.com) -# added BecomeBot link http://www.become.com/site_owners.html -# added topicblogs http://www.topicblogs.com/ -# added Powermarks; seen used by referrer spam -# added YahooSeeker -# added NG/2. http://www.exabot.com/ -# 2005-09-15 added link for Walhello appie -# added bender focused_crawler -# updated YahooSeeker description (blog crawler) -# 2005-09-16 added link for http://linkchecker.sourceforge.net -# added ConveraCrawler/0.9d ( http://www.authoritativeweb.com/crawl) -# added Blogslive info@blogslive.com intelliseek.com -# added BlogPulse (ISSpider-3.0) intelliseek.com -# 2005-09-26 added Feedfetcher-Google (http://www.google.com/feedfetcher.html) -# added EverbeeCrawler -# added Yahoo-Blogs http://help.yahoo.com/help/us/ysearch/crawling/crawling-02.html -# added link for Bloglines http://www.bloglines.com -# 2005-10-19 fixed Feedfetcher-Google (http://www.google.com/feedfetcher.html) -# added Blogshares Spiders (Synchronized V1.5.1) -# added yacy -# 2005-11-21 added Argus www.simpy.com -# added BlogsSay :: RSS Search Crawler (http://www.blogssay.com/) -# added MJ12bot http://majestic12.co.uk/bot.php -# added OpenTaggerBot (http://www.opentagger.com/opentaggerbot.htm) -# added OutfoxBot/0.3 (For internet experiments; outfox.agent@gmail.com) -# added RufusBot Rufus Web Miner http://64.124.122.252.webaroo.com/feedback.html -# added Seekbot (http://www.seekbot.net/bot.html) -# added Yahoo-MMCrawler/3.x (mms-mmcrawler-support@yahoo-inc.com) -# added link for BaiDuSpider -# added link for Blogshares Spider -# added link for StackRambler http://www.rambler.ru/doc/faq.shtml 
-# added link for WISENutbot -# added link for ZyBorg/1.0 (wn-14.zyborg@looksmart.net; http://www.WISEnutbot.com. Moved location to above wisenut to avoid classification as wisenut -# 2005-12-15 -# added FAST Enteprise Crawler/6 (www dot fastsearch dot com). Note spelling Enteprise not Enterprise. -# added findlinks http://wortschatz.uni-leipzig.de/findlinks/ -# added IBM Almaden Research Center WebFountain鈩 http://www.almaden.ibm.com/cs/crawler [hc3] -# added INFOMINE/8.0 VLCrawler (http://infomine.ucr.edu/useragents) -# added lmspider (lmspider@scansoft.com) http://www.nuance.com/ -# added noxtrumbot http://www.noxtrum.com/ -# added SandCrawler (Microsoft) -# added SBIder http://www.sitesell.com/sbider.html -# added SeznamBot http://fulltext.seznam.cz/ -# added sohu-search http://corp.sohu.com/ (looked for //robots.txt not /robots.txt) -# added the ruffle SemanticWeb crawler v0.5 - http://www.unreach.net -# added WebVulnCrawl/1.0 libwww-perl/5.803 (looked for //robots.txt not /robots.txt) -# added Yahoo! Japan keyoshid http://www.yahoo.co.jp/ -# added Y!J http://help.yahoo.co.jp/help/jp/search/indexing/indexing-15.html -# added link for GigaBot -# added link for MagpieRSS -# added link for MSIECrawler -# 2005-12-21 -# added aipbot http://www.aipbot.com aipbot@aipbot.com [matthys70 users.sourceforge.net] -# added Everest-Vulcan Inc./0.1 (R&D project; http://everest.vulcan.com/crawlerhelp) -# added Fast-Search-Engine http://www.fast-search-engine.com/ [matthys70 users.sourceforge.net] -# added g2Crawler (nobody@airmail.net) http://crawler.instantnetworks.net/ -# added Jakarta commons-httpclient http://jakarta.apache.org/commons/httpclient/ (hit robots.txt). May be used as robot or browser - a site may want to remove this entry. -# added OmniExplorer_Bot http://www.omni-explorer.com/ [matthys70 users.sourceforge.net] -# added USTC-Semantic-Group ai.ustc.edu.cn/mas/en/research/index.php ? 
-# 2005-12-22 -# added EARTHCOM.info www.earthcom.info -# added HTTrack off-line browser 'httrack','HTTrack', http://www.httrack.com/ [Moizes Gabor] -# added KummHttp http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_g_l_301105_2\b [Moizes Gabor] -# 2006-01-01 -# added Dulance http://www.dulance.com/bot.jsp -# added MojeekBot http://www.mojeek.com/bot.html -# added nicebot http://www.egghelp.org/setup.htm ? -# added Snappy http://www.urltrends.com/faq.php -# added sohu agent -# added VORTEX http://marty.anstey.ca/robots/vortex/ [matthys70 users.sourceforge.net] -# added zspider http://feedback.redkolibri.com/ -# 2006-01-13 -# added boitho.com-dc http://www.boitho.com/dcbot.html -# added IRLbot http://irl.cs.tamu.edu/crawler -# added virus_detector virus_harvester@securecomputing.com -# added Wavefire http://www.wavefire.com; info@wavefire.com -# added WebFilter Robot -# 2006-01-24 -# added Shim-Crawler http://www.logos.ic.i.u-tokyo.ac.jp/crawler/; crawl@logos.ic.i.u-tokyo.ac.jp -# added Exabot exabot.com -# added LetsCrawl.com http://letscrawl.com -# added ichiro http://help.goo.ne.jp/door/crawlerE.html -# 2006-01-27 additional 22 robots from a list provided by Moizes Gabor -# added ALeadSoftbot http://www.aleadsoft.com/bot.htm -# added CipinetBot http://www.cipinet.com/bot.html -# added Cuasarbot http://www.cuasar.com/ -# added Dumbot http://www.dumbfind.com/ -# added Extreme_Picture_Finder http://www.exisoftware.com/ -# added Fooky.com/ScorpionBot/ScoutOut http://www.fooky.com/scorpionbots -# added IlTrovatore-Setaccio http://www.iltrovatore.it/aiuto/motore_di_ricerca.html bot@iltrovatore.it -# added InsurancoBot http://www.fastspywareremoval.com/ -# added InternetArchive http://lucene.apache.org/nutch/bot.html nutch-agent@lucene.apache.org -# added KazoomBot http://www.kazoom.ca/bot.html kazoombot@kazoom.ca -# added Kurzor http://www.easymail.hu/ cursor@easymail.hu -# added NutchCVS 
http://lucene.apache.org/nutch/bot.html nutch-agent@lucene.apache.org -# added NutchOSU-VLIB http://lucene.apache.org/nutch/bot.html nutch-agent@lucene.apache.org -# added Orbiter http://www.dailyorbit.com/bot.htm -# added PHP_version_tracker http://www.nexen.net/phpversion/bot.php -# added SuperBot http://www.sparkleware.com/superbot/ -# added SynooBot http://www.synoo.de/bot.html webmaster@synoo.com -# added TestBot http://www.agbrain.com/ -# added TutorGigBot http://www.tutorgig.info/ -# added WebIndexer mailto://webindexerv1@yahoo.com -# added WebMiner http://64.124.122.252/feedback.html -# 2006-02-01 -# added heritrix https://sourceforge.net/forum/message.php?msg_id=3550202 -# added Zeus Webster Pro https://sourceforge.net/forum/message.php?msg_id=3141164 -# additional robots from a list provided by Moizes Gabor [ mojzi -a-t- free mail hu ] -# added Candlelight_Favorites_Inspector -# added DomainChecker -# added EasyDL -# added FavOrg -# added Favorites_Sweeper -# added Html_Link_Validator -# added Internet_Ninja -# added JRTwine_Software_Check_Favorites_Utility -# fixed Microsoft_URL_Control -# added miniRank -# added Missigua_Locator -# added NPBot -# added Ocelli -# added Onet.pl_SA -# added proodleBot -# added SearchGuild_DMOZ_Experiment -# added Susie -# added Website_Monitoring_Bot -# added Xenu_Link_Sleuth -# 2006-05-15 -# added ASPseek http://www.aspseek.org/ -# added AdamM Bot http://home.blic.net/adamm/ -# added archive.org_bot http://crawls.archive.org/collections/bncf/crawl.html -# added arianna.libero.it (Italian Portal/search engine) -# added Biz360 spider http://www.biz360.com -# added BlogBridge Service http://www.blogbridge.com/ -# added BlogSearch http://www.icerocket.com/ -# added libcrawl -# added edgeio-relanshanbottriever http://www.edgeio.com -# added FeedFlow http://feedflow.com/about -# added Biblioteca Nazionale Centrale di Firenze (Italian National Archive) http://www.bncf.firenze.sbn.it/raccolta.txt -# added Java catchall - used by 
many spam bots -# added lanshanbot http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=%5Cbid_g_l_140406_1%5Cb -# added msnbot-media http://search.msn.com/msnbot.htm -# added MT::Telegraph::Agent -# added Netluchs http://www.netluchs.de/ (German SE bot) -# added oBot http://www.webmasterworld.com/forum11/1616.htm -# added Onfolio http://www.onfolio.com/ (IE Toolbar plugin) - hit rss feeds. -# added ping.blo.gs http://blo.gs/ping.php blog bot -# added Sphere Scout http://www.sphere.com/ -# added sproose crawler http://www.sproose.com/bot.html -# added SyndicAPI http://syndicapi.com/bot.html -# added Yahoo! Mindset http://mindset.research.yahoo.com/ -# added msrabot -# added Vagabondo & Vagabondo-WAP http://www.wise-guys.nl/Contact/index.php?botselected=webagents&lang=uk -# fixed Missigua Locator detection (Missigua_Locator -> Missigua Locator) -# changed echo to echo! to avoid conflict with the bonecho (Firefox 2.0) browser. -# This requires you to reprocess historic logs if you want EchO! to be recognized for older reports. -# 2006-05-17 -# added Alpha Search Agent # 62.152.125.60 Eurologon Srl -# added Krugle http://www.krugle.com/crawler/info.html the search engine for developers -# added Octora Beta Bot http://www.octora.com/ # Blog and Rss Search Engine -# added UbiCrawler http://law.dsi.unimi.it/ubicrawler/ -# added Yahoo! Slurp China http://misc.yahoo.com.cn/help.html -# You must reprocess old logs for the Yahoo! 
Slurp China bot to be detected in old reports -# 2006-05-20 -# added 1-More Scanner http://www.myzips.com/software/1-More-Scanner.phtml -# added Accoona-AI-Agent http://www.accoona.com/ -# added ActiveBookmark http://www.libmaster.com/active_bookmark.php -# added BIGLOTRON http://www.biglotron.com/robot.html -# added Bookmark-Manager http://bkm.sourceforge.net/ -# added cbn00glebot -# added Cerberian Drtrs http://www.pgts.com.au/cgi-bin/psql?robot_info=25240 -# added CFNetwork http://www.cocoadev.com/index.pl?CFNetwork -# added CheckWeb link validator http://p.duby.free.fr/chkweb.htm -# added Computer and Automation Research Institute Crawler http://www.ilab.sztaki.hu/~stamas/publications/p184-benczur.html -# added ConveraCrawler http://www.authoritativeweb.com/crawl/ -# added ConveraMultiMediaCrawler http://www.authoritativeweb.com/crawl/ -# added CSE HTML Validator Lite Online http://online.htmlvalidator.com/php/onlinevallite.php -# added Cursor http://adcenter.hu/docs/en/bot.html -# added Custo http://www.netwu.com/custo/ -# added DataFountains/DMOZ Downloader http://infomine.ucr.edu/ -# added Deepindex http://www.deepindex.net/faq.php -# added DNSGroup http://www.dnsgroup.com/ -# added DoCoMo http://www.nttdocomo.co.jp/ -# added dumm.de-Bot http://www.dumm.de/ -# added ETS v http://www.freetranslation.com/help/ -# added eventax http://www.eventax.de/ -# added FAST Enterprise Crawler * crawleradmin.t-info@telekom.de http://www.telekom.de/ -# added FAST Enterprise Crawler http://www.fast.no/ -# added FAST Enterprise Crawler * T-Info_BI_cluster crawleradmin.t-info@telekom.de http://www.telekom.de/ -# added FeedValidator http://feedvalidator.org/ -# added FilmkameraBot http://www.filmkamera.at/bot.html -# added Findexa Crawler http://www.findexa.no/gulesider/article26548.ece -# added Global Fetch http://www.wesonet.com/ -# added GOFORITBOT http://www.goforit.com/about/ -# added GoForIt.com http://www.goforit.com/about/ -# added GPU p2p crawler 
http://gpu.sourceforge.net/search_engine.php -# added HooWWWer http://cosco.hiit.fi/search/hoowwwer/ -# added HPPrint -# added HTMLParser http://htmlparser.sourceforge.net/ -# added Hundesuche.com-Bot http://www.hundesuche.com/ -# added InfoBot http://www.infobot.org/ -# added InfociousBot http://corp.infocious.com/tech_crawler.php -# added InternetSupervision http://internetsupervision.com/ -# added isearch2006 http://www.yahoo.com.cn/ -# added IUPUI_Research_Bot http://spamhuntress.com/2005/04/25/a-mail-harvester-visits/ -# added KalamBot http://64.124.122.251/feedback.html -# added kamano.de NewsFeedVerzeichnis http://www.kamano.de/ -# added Kevin http://dznet.com/kevin/ -# added KnowItAll http://www.cs.washington.edu/research/knowitall/ -# added Knowledge.com http://www.knowledge.com/ -# added Kouaa Krawler http://www.kouaa.com/ -# added ksibot http://ego.ms.mff.cuni.cz/ -# added Link Valet Online http://www.htmlhelp.com/tools/valet/ -# added lwp-request http://search.cpan.org/~gaas/libwww-perl-5.69/bin/lwp-request -# added lwp-trivial http://search.cpan.org/src/GAAS/libwww-perl-5.805/lib/LWP/Simple.pm -# added MapoftheInternet.com http://MapoftheInternet.com/ -# added Matrix S.p.A. 
- FAST Enterprise Crawler http://tin.virgilio.it/ -# added Megite http://www.megite.com/ -# added Metaspinner http://index.meta-spinner.de/ -# added Mini-reptile -# added Misterbot http://www.misterbot.fr/ -# added Miva http://www.miva.com/ -# added Mizzu Labs http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_m_141105_2\b -# added MSRBOT http://research.microsoft.com/research/sv/msrbot/ -# added MS SharePoint Portal Server - MS Search 4.0 Robot http://support.microsoft.com/default.aspx?scid=kb;en-us;284022 -# added Mydoyouhike http://www.doyouhike.net/my -# added NASA Search http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_n_s_140506_2\b -# added NetSprint http://www.netsprint.pl/serwis/ -# added NimbleCrawler http://www.healthline.com/ -# added OpenWebSpider http://www.openwebspider.org/ -# added Oracle Ultra Search http://www.oracle.com/technology/products/ultrasearch/index.html -# added OSSProxy http://www.marketscore.com/FAQ.Aspx -# added passwordmaker.org http://passwordmaker.org/ -# added PEAR HTTP Request class http://pear.php.net/ -# added PEERbot http://www.peerbot.com/ -# added PHP version tracker http://www.nexen.net/phpversion/bot.php -# added PictureOfInternet http://malfunction.org/poi/ -# added plinki http://www.plinki.com/ -# added Port Huron Labs http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_n_s_1133\b -# added PostFavorites http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_n_s_1135\b -# added ProjectWF-java-test-crawler -# added PyQuery http://sourceforge.net/projects/pyquery/ -# added Schizozilla http://spamhuntress.com/2005/03/18/gizmo/ -# added Scumbot -# added Sensis Web Crawler http://www.sensis.com.au/ -# added snap.com beta crawler http://www.snap.com/ -# added Steeler http://www.tkl.iis.u-tokyo.ac.jp/~crawler/ -# added STEROID Download 
http://faqs.org.ru/progr/pascal/delphi_internet2.htm -# added Suchfin-Bot http://www.suchfin.de/ -# added Sunrise http://www.sunrisexp.com/ -# added Tagyu Agent http://www.tagyu.com/ -# added Tcl http client package http://www.tcl.tk/man/tcl8.4/TclCmd/http.htm -# added TeragramCrawlerSURF http://www.teragram.com/ -# added Test Crawler http://netp.ath.cx/ -# added UnChaos Bot Hybrid Web Search Engine http://www.unchaos.com/ -# added unido-bot http://www.unchina.org/unido/unido/our_projects/3_3.html -# added UniversalFeedParser http://feedparser.org/ (seen from md301000.inktomisearch.com) -# added updated http://www.updated.com/ -# added Vermut http://vermut.aol.com -# added versus crawler from eda.baykan@epfl.ch http://www.epfl.ch/Eindex.html -# added Vespa Crawler (Yahoo Norway?) http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=%5Cbid_t_z_030406_1%5Cb -# added VSE http://www.vivisimo.com/ -# added webcrawl.net http://www.webcrawl.net/ -# added Web Downloader http://www.krasu.ru/soft/chuchelo/ -# added Webdup http://www.webdup.com/en/index.html -# added Wells Search http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_t_z_1484\b -# added WordPress http://wordpress.org/ -# added wume crawler http://wume.cse.lehigh.edu/~xiq204/crawler/ -# added Xenu's Link Sleuth (with ') -# added xirq http://www.xirq.com/ -# added yoogliFetchAgent http://www.yoogli.com/ -# added Z-Add Link Checker http://w3.z-add.co.uk/linkcheck/ -# -- fix - some robots were reported with _ where _ should have been a space. -# changed Xenu Link Sleuth -# changed microsoft[_+\s]url[_+\s]control -> microsoft_url_control -# changed favorites_sweeper -> favorites_sweeper -# -- updates -# updated AskJeeves to Ask -# 2012-06-05 Albrecht Mueller -# added Grabber from SDSC (San Diego Supercomputer Center). 
-# 2013-09-30 Albrecht Mueller -# AWStats probably cannot detect this bot as it identifies itself in -# the referrer field and not in the user agent string. -#92.113.100.35 - - [29/Sep/2013:17:22:46 +0200] "GET /robots.txt HTTP/1.1" 200 516 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0" "-" -#92.113.100.35 - - [29/Sep/2013:17:22:49 +0200] "GET /tghome.htm HTTP/1.1" 200 4445 "http://extrabot.com/help/frytygativyheku.htm" "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0" "-" -#92.113.100.35 - - [29/Sep/2013:17:22:51 +0200] "GET / HTTP/1.1" 200 5467 "http://extrabot.com/help/frytygativyheku.htm" "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0" "-" - -# to do MS Search 4.0 Robot - -#package AWSROB; - - -# Robots list was found at http://www.robotstxt.org/wc/active/all.txt -# Other robots can be found at http://www.jafsoft.com/searchengines/webbots.html -# Rem: To avoid bad detection, some robot's ids were removed from this list: -# - Robots with ID of 3 letters only -# - Robots called 'webs' and 'tcl' -# Rem: directhit changed into direct_hit (its real id) -# Rem: calif changed into calif[^r] to avoid confusion between Tiscalifreenet browser -# Rem: fish changed into [^a]fish to avoid confusion between Madsafish browser -# Rem: roadrunner changed into road_runner -# Rem: lycos changed to lycos_ to avoid confusion with lycos-online browser -# Rem: voyager changed into ^voyager\/ to avoid to exclude voyager and amigavoyager browser - -# RobotsSearchIDOrder -# It contains all matching criteria to search for in log fields. This list is -# used to know in which order to search Robot IDs. -# Most frequent ones are in list1, used when LevelForRobotsDetection is 1 or more -# Minor robots are in list2, used when LevelForRobotsDetection is 2 or more -# Note: Robots IDs are in lower case, '_', ' ' and '+' are changed into '[_+\s]' and are quoted. 
-#------------------------------------------------------- -@RobotsSearchIDOrder_list1 = ( -# Common robots (In robot file) -'appie', -'architext', -'bingpreview', -'bjaaland', -'contentmatch', -'ferret', -'googlebot\-image', -'googlebot', -'google\-sitemaps', -'google[_+\s]web[_+\s]preview', -'grabber', -'gulliver', -'virus[_+\s]detector', # Must be before harvest -'harvest', -'htdig', -'jeeves', -'linkwalker', -'lilina', -'lycos[_+\s]', -'moget', -'muscatferret', -'myweb', -'nomad', -'scooter', -'slurp', -'^voyager\/', -'weblayers', -# Common robots (Not in robot file) -'antibot', -'bruinbot', -'digout4u', -'echo!', -'fast\-webcrawler', -'ia_archiver\-web\.archive\.org', # Must be before ia_archiver to avoid confusion with alexa -'ia_archiver', -'jennybot', -'mercator', -'netcraft', -'msnbot\-media', -'msnbot-udiscovery', -'msnbot', -'petersnews', -'relevantnoise\.com', -'unlost_web_crawler', -'voila', -'webbase', -'webcollage', -'cfetch', -'zyborg', # Must be before wisenut -'wisenutbot' -); -@RobotsSearchIDOrder_list2 = ( -# Less common robots (In robot file) -'007ac9', -'[^a]fish', -'abcdatos', -'abonti\.com', -'acme\.spider', -'ahoythehomepagefinder', -'ahrefsbot', -'alkaline', -'anthill', -'arachnophilia', -'arale', -'araneo', -'aretha', -'ariadne', -'powermarks', -'arks', -'aspider', -'atn\.txt', -'atomz', -'auresys', -'backrub', -'bbot', -'bigbrother', -'blackwidow', -'blindekuh', -'bloodhound', -'borg\-bot', -'brightnet', -'bspider', -'cactvschemistryspider', -'calif[^r]', -'cassandra', -'cgireader', -'checkbot', -'christcrawler', -'churl', -'cienciaficcion', -'cms\scrawler', -'collective', -'combine', -'conceptbot', -'coolbot', -'core', -'cosmos', -'crazywebcrawler', -'cruiser', -'cusco', -'cyberspyder', -'desertrealm', -'deweb', -'dienstspider', -'digger', -'diibot', -'direct_hit', -'dnabot', -'domainappender', -'download_express', -'dragonbot', -'dwcp', -'e\-collector', -'ebiness', -'elfinbot', -'emacs', -'emcspider', -'esther', -'evliyacelebi', 
-'fastcrawler', -'feedcrawl', -'fdse', -'felix', -'fetchrover', -'fido', -'finnish', -'fireball', -'fouineur', -'francoroute', -'freecrawl', -'funnelweb', -'gama', -'gazz', -'gcreep', -'getbot', -'geturl', -'golem', -'gougou', -'grapnel', -'griffon', -'gromit', -'gulperbot', -'hambot', -'havindex', -'hometown', -'htmlgobble', -'hyperdecontextualizer', -'iajabot', -'iaskspider', -'hl_ftien_spider', -'sogou', -'icjobs\.de', -'iconoclast', -'ilse', -'imagelock', -'incywincy', -'informant', -'infoseek', -'infoseeksidewinder', -'infospider', -'inspectorwww', -'intelliagent', -'irobot', -'iron33', -'israelisearch', -'javabee', -'jbot', -'jcrawler', -'jobo', -'jobot', -'joebot', -'jubii', -'jumpstation', -'kapsi', -'katipo', -'kilroy', -'ko[_+\s]yappo[_+\s]robot', -'kummhttp', -'labelgrabber\.txt', -'larbin', -'legs', -'linkidator', -'linkscan', -'lockon', -'logo_gif', -'macworm', -'magpie', -'marvin', -'mattie', -'mediafox', -'merzscope', -'meshexplorer', -'mindcrawler', -'mnogosearch', -'momspider', -'monster', -'motor', -'muncher', -'mwdsearch', -'ndspider', -'nederland\.zoek', -'netcarta', -'netmechanic', -'netscoop', -'newscan\-online', -'nhse', -'northstar', -'nzexplorer', -'objectssearch', -'occam', -'octopus', -'openfind', -'orb_search', -'packrat', -'pageboy', -'parasite', -'patric', -'pegasus', -'perignator', -'perlcrawler', -'phantom', -'phpdig', -'piltdownman', -'pimptrain', -'pioneer', -'pitkow', -'pjspider', -'plumtreewebaccessor', -'poppi', -'portalb', -'psbot', -'python', -'raven', -'rbse', -'resumerobot', -'rhcs', -'road_runner', -'robbie', -'robi', -'robocrawl', -'robofox', -'robozilla', -'roverbot', -'rules', -'safetynetrobot', -'semalt', #Note: This entry will not work as this crawler identifies itself -# in the referrer string and not in the user agent string -'search\-info', -'search_au', -'searchprocess', -'senrigan', -'sgscout', -'shaggy', -'shaihulud', -'sift', -'simbot', -'sistrix', #Virus/trojan-infection? fr-crawler, ca-crawler? 
See https://www.projecthoneypot.org/ip_37.59.55.128, https://www.projecthoneypot.org/ip_198.27.80.144 -'site\-valet', -'sitetech', -'skymob', -'slcrawler', -'smartspider', -'snooper', -'solbot', -'speedy', -'spider[_+\s]monkey', -'spiderbot', -'spiderline', -'spiderman', -'spiderview', -'spry', -'sqworm', -'ssearcher', -'suke', -'sunrise', -'suntek', -'sven', -'tach_bw', -'tagyu_agent', -'tailrank', -'tarantula', -'tarspider', -'techbot', -'templeton', -'titan', -'titin', -'tkwww', -'tlspider', -'ucsd', -'udmsearch', -'universalfeedparser', -'urlck', -'valkyrie', -'verticrawl', -'victoria', -'visionsearch', -'voidbot', -'vwbot', -'w3index', -'w3m2', -'wallpaper', -'wanderer', -'wapspIRLider', -'webbandit', -'webcatcher', -'webcopy', -'webfetcher', -'webfoot', -'webinator', -'weblinker', -'webmirror', -'webmoose', -'webquest', -'webreader', -'webreaper', -'websnarf', -'webspider', -'webvac', -'webwalk', -'webwalker', -'webwatch', -'whatuseek', -'whowhere', -'wired\-digital', -'wmir', -'wolp', -'wombat', -'wordpress', -'worm', -'woozweb', -'wwwc', -'wz101', -'xenu\slink\ssleuth', -'xget', -# Other robots reported by users -'^finbot', #UA string starts with "finbot", should not match "elfinbot" -'^webindex$', #UA should not match "webindexer" -'1\-more_scanner', -'360spider', -'a6-indexer', -'accoona\-ai\-agent', -'activebookmark', -'adamm_bot', -'adsbot-google', -'advbot', -'affectv\.co\.uk', -'almaden', -'aipbot', -'aleadsoftbot', -'alpha_search_agent', -'allrati', -'aport', -'applebot', -'archive\-de\.com', -'archive\.org_bot', -'argus', # Must be before nutch -'arianna\.libero\.it', -'aspseek', -'asterias', -'awbot', -'backlinktest\.com', -'baiduspider', -'becomebot', -'bender', -'betabot', -'biglotron', -'bittorrent_bot', -'biz360[_+\s]spider', -'blexbot', -'blogbridge[_+\s]service', -'bloglines', -'blogpulse', -'blogsearch', -'blogshares', -'blogslive', -'blogssay', -'bncf\.firenze\.sbn\.it\/raccolta\.txt', -'bobby', -'boitho\.com\-dc', -'bookmark\-manager', 
-'boris', -'bubing', -'bumblebee', -'candlelight[_+\s]favorites[_+\s]inspector', -'careerbot', -'cbn00glebot', -'ccbot', -'cerberian_drtrs', -'cfnetwork', -'cipinetbot', -'checkweb_link_validator', -'cliqzbot', -'commons\-httpclient', -'computer_and_automation_research_institute_crawler', -'converamultimediacrawler', -'converacrawler', -'copubbot', -'cscrawler', -'cse_html_validator_lite_online', -'cuasarbot', -'cursor', -'custo', -'datafountains\/dmoz_downloader', -'dataprovider\.com', -'daumoa', -'daviesbot', -'daypopbot', -'deepindex', -'deusu', -'dipsie\.bot', -'dnsgroup', -'doccheckbot', -'domainchecker', -'domainsdb\.net', -'dotbot', -'duckduckgo-favicons-bot', -'dulance', -'dumbot', -'dumm\.de\-bot', -'earthcom\.info', -'easydl', -'eccp', -'edgeio\-retriever', -'ernst[:blank:]2\.0', -'ets_v', -'exactseek', -'extreme[_+\s]picture[_+\s]finder', -'eventax', -'everbeecrawler', -'everest\-vulcan', -'ezresult', -'enteprise', -'facebook', -'facebot', -'fast_enterprise_crawler.*crawleradmin\.t\-info@telekom\.de', -'fast_enterprise_crawler.*t\-info_bi_cluster_crawleradmin\.t\-info@telekom\.de', -'finderlein[_+\s]research[_+\s]crawler', -'matrix_s\.p\.a\._\-_fast_enterprise_crawler', # must come before fast enterprise crawler -'fast_enterprise_crawler', -'fast\-search\-engine', -'fastbot', -'favicon', -'favorg', -'favorites_sweeper', -'feedburner', -'feedfetcher\-google', -'feedflow', -'feedster', -'feedsky', -'feedvalidator', -'fetchbot', -'filmkamerabot', -'filterdb\.iss\.net', -'findlinks', -'findexa_crawler', -'firmilybot', -'foaf-search\.net', -'fooky\.com\/ScorpionBot', -'g2crawler', -'gaisbot', -'geniebot', -'genieo', -'gigablastopensource', -'gigabot', -'girafabot', -'global_fetch', -'gnodspider', -'goforit\.com', -'goforitbot', -'gonzo', -'grapeshot', -'grub', -'gpu_p2p_crawler', -'henrythemiragorobot', -'heritrix', -'holmes', -'hoowwwer', -'hpprint', -'htmlparser', -'html[_+\s]link[_+\s]validator', -'httrack', -'hundesuche\.com\-bot', -'i-bot', -'icarus6j', 
-'ichiro', -'idmarch', -'iltrovatore\-setaccio', -'implisensebot', -'infobot', -'infociousbot', -'infohelfer', -'infomine', -'insurancobot', -'integromedb\.org', -'internet[_+\s]ninja', -'internetarchive', -'internetseer', -'internetsupervision', -'ips\-agent', -'irlbot', -'isearch2006', -'istellabot', -'iupui_research_bot', -'izsearch', -'james\sbot', -'jobboerse', #AWStats seems not to find this one despite the fact that "JobboerseBot" and "jobboerse.com" appear in the UA-string, maybe some previous entry matches -'jrtwine[_+\s]software[_+\s]check[_+\s]favorites[_+\s]utility', -'justview', -'kalambot', -'kamano\.de_newsfeedverzeichnis', -'kazoombot', -'kevin', -'keyoshid', # Must come before Y!J -'kinjabot', -'kinja\-imagebot', -'knowitall', -'knowledge\.com', -'kouaa_krawler', -'krugle', -'ksibot', -'kurzor', -'lanshanbot', -'letscrawl\.com', -'libcrawl', -'linkbot', -'linkdex\.com', -'link_valet_online', -'metager\-linkchecker', # Must be before linkchecker -'linkchecker', -'linkstats\sbot', -'lipperhey', -'livejournal\.com', -'lmspider', -'loadtimebot', -'lssrocketcrawler', -'ltbot', -'ltx71', -'lwp\-request', -'lwp\-trivial', -'madaali\.de', -'magpierss', -'mail\.ru', -'mapoftheinternet\.com', -'meanpathbot', -'mediabot', -'mediapartners\-google', -'megaindex', -'megite', -'memorybot', -'metager2-verification-bot', -'metajobbot', #Does not show up in the results of Sep. 2015 despite the fact that the corresponing log file has about 40 entries containing "MetaJobBot" in the UA string - strange. 
-'metaspinner', -'miadev', -'microsoft\sbits', -'microsoft.*discovery', # = 'microsoft (?:office (?:protocol|existence)|data access internet publishing provider protocol) discovery', -'microsoft[_+\s]url[_+\s]control', -'mindupbot', -'mini\-reptile', -'minirank', -'missigua_locator', -'misterbot', -'miva', -'mizzu_labs', -'mj12bot', -'mojeekbot', -'msiecrawler', -'ms[_+\s]search[_+\s]6\.0[_+\s]robot', -'ms_search_4\.0_robot', -'msrabot', -'msrbot', -'mt::telegraph::agent', -'mydoyouhike', -'nagios', -'nasa_search', -'netestate\sne\scrawler', -'netluchs', -'netsprint', -'newsgatoronline', -'nicebot', -'nimblecrawler', -'noxtrumbot', -'npbot', -'loocalcrawler/nutch', -'nutchcvs', -'nutchosu\-vlib', -'nutch', # Must come after other nutch versions -'ocelli', -'octora_beta_bot', -'omniexplorer[_+\s]bot', -'onet\.pl[_+\s]sa', -'onfolio', -'opentaggerbot', -'openwebspider', -'optimizer', -'oracle_ultra_search', -'orangebot', -'orbiter', -'yodaobot', -'qihoobot', -'qwantify', -'passwordmaker\.org', -'pear_http_request_class', -'peerbot', -'perman', -'php[_+\s]version[_+\s]tracker', -'phpcrawl', -'picmole', -'pictureofinternet', -'ping\.blo\.gs', -'plinki', -'pluckfeedcrawler', -'plukkie', -'pogodak', -'pompos', -'popdexter', -'port_huron_labs', -'postfavorites', -'projectwf\-java\-test\-crawler', -'proodlebot', -'publiclibraryarchive', -'pyquery', -'rambler', -'redalert', -'riddler', -'rogerbot', -'rojo', -'rssimagesbot', -'ruffle', -'rufusbot', -'safeads\.xyz', -'safesearch', -'sandcrawler', -'savetheworldheritage', -'sbider', -'schizozilla', -'scumbot', -'searchguild[_+\s]dmoz[_+\s]experiment', -'searchmetricsbot', -'seekbot', -'semrushbot', -'sensis_web_crawler', -'seodiver', -'seokicks\.de', -'seoscanners', -'seznambot', -'shim\-crawler', -'shoutcast', -'sitedomain-bot', -'siteexplorer\.info', -'skimbot', -'slysearch', -'smtbot', -'snap\.com_beta_crawler', -'sohu\-search', -'sohu', # "sohu agent" -'snappy', -'spbot', -'sphere_scout', -'spiderlytics', -'spip', 
-'sproose_crawler', -'ssearch_bot', -'steeler', -'steroid__download', -'stq_bot', -'suchfin\-bot', -'superbot', -'surveybot', -'susie', -'syndic8', -'syndicapi', -'synoobot', -'tcl_http_client_package', -'technoratibot', -'teragramcrawlersurf', -'test_crawler', -'testbot', -'thumbsniper', -'t\-h\-u\-n\-d\-e\-r\-s\-t\-o\-n\-e', -'topicblogs', -'turnitinbot', -'turtlescanner', # Must be before turtle -'turtle', -'tutorgigbot', -'twiceler', -'ubicrawler', -'ultraseek', -'unchaos_bot_hybrid_web_search_engine', -'unido\-bot', -'unisterbot', -'updated', -'ustc\-semantic\-group', -'vagabondo\-wap', -'vagabondo', -'vebidoobot', -'vermut', -'versus_crawler_from_eda\.baykan@epfl\.ch', -'vespa_crawler', -'voltron', -'vortex', -'vse\/', -'w3c\-checklink', -'w3c[_+\s]css[_+\s]validator[_+\s]jfouffa', -'w3c_validator', -'watchmouse', -'wavefire', -'waybackarchive\.org', -'wbsearchbot', -'webclipping\.com', -'webcompass', -'webcrawl\.net', -'web_downloader', -'webdup', -'webfilter', -'webindexer', -'webminer', -'website[_+\s]monitoring[_+\s]bot', -'webvulncrawl', -'wells_search', -'wer-liefert-was', -'wesee:search', -'wevikabot', -'wonderer', -'wotbox', -'wume_crawler', -'wwweasel', -'xenu\'s_link_sleuth', -'xenu_link_sleuth', -'xirq', -'xovibot', -'y!j', # Must come after keyoshid Y!J -'yacy', -'yahoo\-blogs', -'yahoo\-verticalcrawler', -'yahoofeedseeker', -'yahooseeker\-testing', -'yahooseeker', -'yahoo\-mmcrawler', -'yahoo!_mindset', -'yandex', -'flexum', -'yanga', -'yet-another-spider', -'yisouspider', -'yooglifetchagent', -'z\-add_link_checker', -'zealbot', -'zhuaxia', -'zspider', -'zeus', -'ng\/1\.', # put at end to avoid false positive -'ng\/2\.', # put at end to avoid false positive -'exabot', # put at end to avoid false positive -# Additional bots found by Sussex. -'^[1-3]$', # Hiding bots. Doesn't appear to be a valid user agent. 
-'alltop', -'applesyndication', -'asynchttpclient', -'bingbot', -'blogged_crawl', -'bloglovin', -'butterfly', -'buzztracker', -'carpathia', -'catbot', -'chattertrap', -'check_http', #(nagios) a monitoring tool -'coldfusion', -'covario', -'daylifefeedfetcher', -'discobot', -'dlvr\.it', -'dreamwidth', -'drupal', -'ezoom', -'feedmyinbox', -'feedroll\.com', -'feedzira', -'fever\/', -'freenews', -'geohasher', -'hanrss', -'inagist', -'jacobin\sclub', -'jakarta', -'js\-kit', -'largesmall\scrawler', -'linkedinbot', -'longurl', -'metauri', -'microsoft\-webdav\-miniredir', -'^motorola$', -'movabletype', -# These appear to be bots trying to hide. All of the usual architecture data is missing. -'^mozilla\/3\.0\s\(compatible$', -'^mozilla\/4\.0$', -'^mozilla\/4\.0\s\(compatible;\)$', -'^mozilla\/5\.0$', -'^mozilla\/5\.0\s\(compatible;$', -'^mozilla\/5\.0\s\(en\-us\)$', -'^mozilla\/5\.0\sfirefox\/3\.0\.5$', -'^msie', -# End of hiding bots. -'netnewswire', -'\snetseer\s', -'netvibes', -'newrelicpinger', -'newsfox', -'nextgensearchbot', -'ning', -'pingdom', -'pita', -'postpost', -'postrank', -'printfulbot', -'protopage', -'proximic', -'quipply', -'r6\_', -'ratingburner', -'regator', -'rome\sclient', -'rpt\-httpclient', -'rssgraffiti', -'sage\+\+', -'scoutjet', -'simplepie', -'sitebot', -'summify\.com', -'superfeedr', -'synthesio', -'teoma', -'topblogsinfo', -'topix\.net', -'trapit', -'trileet', -'tweetedtimes', -'twisted\spagegetter', -'twitterbot', -'twitterfeed', -'unwindfetchor', -'wazzup', -'windows\-rss\-platform', -'wiumi', -'xydo', -'yahoo!\sslurp', -'yahoo\spipes', -'yahoo\-newscrawler', -'yahoocachesystem', -'yahooexternalcache', -'yahoo!\ssearchmonkey', -'yahooysmcm', -'yammer', -# 'yandexbot', #already covered by 'yandex' -'yeti', -'yie8', -'youdao', -'yourls', -'zemanta', -'zend_http_client', -'zumbot', -# Other id that are 99% of robots -'wget', -'libwww', -'^java\/[0-9]' # put at end to avoid false positive -); -@RobotsSearchIDOrder_listgen = ( -# Generic robot 
-'robot', -'checker', -'crawl', -'discovery', -'hunter', -'scanner', -'spider', -'sucker', -'bot[\s_+:,\.\;\/\\\-]', -# Identifies -#"Mozilla/5.0 (Linux; U; Android 4.2.2; de-de; CUBOT P9 Build/JDQ39) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30" -#as a but. There is a Android mobile phone called "CUBOT P9", so this is probably not a bot. -'[\s_+:,\.\;\/\\\-]bot', -'curl', -'php', -'ruby\/', -'no_user_agent' -); - - - -# RobotsHashIDLib -# List of robots names ('robot id','robot clear text') -#------------------------------------------------------- -%RobotsHashIDLib = ( -# Common robots (In robot file) -'appie','Walhello appie', -'architext','ArchitextSpider', -'bingpreview','Bing Preview bot', -'bjaaland','Bjaaland', -'ferret','Wild Ferret Web Hopper #1, #2, #3', -'contentmatch','Yahoo!China ContentMatch Crawler', -'googlebot\-image','Googlebot-Image', -'googlebot','Googlebot', -'google\-sitemaps', 'Google Sitemaps', -'grabber', 'Grabber (SDSC)', -'google[_+\s]web[_+\s]preview', 'Google Web Preview', -'gulliver','Northern Light Gulliver', -'virus[_+\s]detector','virus_detector', -'harvest','Harvest', -'htdig','ht://Dig', -'jeeves','Ask', -'linkwalker','LinkWalker', -'lilina','Lilina', -'lycos[_+\s]','Lycos', -'moget','moget', -'muscatferret','Muscat Ferret', -'myweb','Internet Shinchakubin', -'nomad','Nomad', -'scooter','Scooter', -'slurp','Yahoo Slurp', -'^voyager\/','Voyager', -'weblayers','Weblayers', -# Common robots (Not in robot file) -'antibot','Antibot', -'bruinbot','The web archive', -'digout4u','Digout4u', -'echo!','EchO!', -'fast\-webcrawler','Fast-Webcrawler', -'ia_archiver\-web\.archive\.org','The web archive (IA Archiver)', -'ia_archiver','Alexa (IA Archiver)', -'jennybot','JennyBot', -'mercator','Mercator', -'msnbot\-media','MSNBot-media', -'msnbot-udiscovery', 'msnbot-UDiscovery Note: AWStats counts most of its traffic as user traffic', -'msnbot','MSNBot', -'netcraft','Netcraft', -'petersnews','Petersnews', 
-'unlost_web_crawler','Unlost Web Crawler', -'voila','Voila', -'webbase', 'WebBase', -'zyborg','ZyBorg', -'wisenutbot','WISENutbot', -'webcollage','WebCollage', -'cfetch','Cfetch', -# Less common robots (In robot file) -'007ac9', '007ac9 Crawler, seems to belong to SISTRIX', -'[^a]fish','Fish search', -'abcdatos','ABCdatos BotLink', -'abonti\.com','Abonti WebSearch', -'acme\.spider','Acme.Spider', -'ahoythehomepagefinder','Ahoy! The Homepage Finder', -'ahrefsbot', 'AhrefsBot', -'alkaline','Alkaline', -'anthill','Anthill', -'arachnophilia','Arachnophilia', -'arale','Arale', -'araneo','Araneo', -'aretha','Aretha', -'ariadne','ARIADNE', -'powermarks','Powermarks', # must come before Arks; seen used by referrer spam -'arks','arks', -'aspider','ASpider (Associative Spider)', -'atn\.txt','ATN Worldwide', -'atomz','Atomz.com Search Robot', -'auresys','AURESYS', -'backrub','BackRub', -'bbot','BBot', -'bigbrother','Big Brother', -'blackwidow','BlackWidow', -'blindekuh','Die Blinde Kuh', -'bloodhound','Bloodhound', -'borg\-bot','Borg-Bot', -'brightnet','bright.net caching robot', -'bspider','BSpider', -'cactvschemistryspider','CACTVS Chemistry Spider', -'calif[^r]','Calif', -'cassandra','Cassandra', -'cgireader','Digimarc Marcspider/CGI', -'checkbot','Checkbot', -'christcrawler','ChristCrawler.com', -'churl','churl', -'cienciaficcion','cIeNcIaFiCcIoN.nEt', -'cms\scrawler', 'CMS Crawler', -'collective','Collective', -'combine','Combine System', -'conceptbot','Conceptbot', -'coolbot','CoolBot', -'core','Web Core / Roots', -'cosmos','XYLEME Robot', -'crazywebcrawler', 'CrazyWeb Crawler', -'cruiser','Internet Cruiser Robot', -'cusco','Cusco', -'cyberspyder','CyberSpyder Link Test', -'desertrealm','Desert Realm Spider', -'deweb','DeWeb(c) Katalog/Index', -'dienstspider','DienstSpider', -'digger','Digger', -'diibot','Digital Integrity Robot', -'direct_hit','Direct Hit Grabber', -'dnabot','DNAbot', -'domainappender', 'DomainAppender', -'download_express','DownLoad Express', 
-'dragonbot','DragonBot', -'dwcp','DWCP (Dridus\' Web Cataloging Project)', -'e\-collector','e-collector', -'ebiness','EbiNess', -'elfinbot','ELFINBOT', -'emacs','Emacs-w3 Search Engine', -'emcspider','ananzi', -'esther','Esther', -'evliyacelebi','Evliya Celebi', -'fastcrawler','FastCrawler', -'feedcrawl','FeedCrawl by feed@aobo.com', -'fdse','Fluid Dynamics Search Engine robot', -'felix','Felix IDE', -'fetchrover','FetchRover', -'fido','fido', -'finnish','Finnish', -'fireball','KIT-Fireball', -'fouineur','Fouineur', -'francoroute','Robot Francoroute', -'freecrawl','Freecrawl', -'funnelweb','FunnelWeb', -'gama','gammaSpider, FocusedCrawler', -'gazz','gazz', -'gcreep','GCreep', -'getbot','GetBot', -'geturl','GetURL', -'golem','Golem', -'gougou','GouGou', -'grapnel','Grapnel/0.01 Experiment', -'griffon','Griffon', -'gromit','Gromit', -'gulperbot','Gulper Bot', -'hambot','HamBot', -'havindex','havIndex', -'hometown','Hometown Spider Pro', -'htmlgobble','HTMLgobble', -'hyperdecontextualizer','Hyper-Decontextualizer', -'iajabot','iajaBot', -'iaskspider','Sina Iask Spider', -'hl_ftien_spider','Hylanda', -'sogou','Sogou Spider', -'icjobs\.de', 'iCjobs Spider Note: Most traffic counts as user traffic', -#20130805 The user agent string of the icjobs-spider contained the -#identifying string only when it accessed the robots.txt file. -#When it accessed the actual content it did not identify itself as -#a spider. Thus traffic of this spider was counted as user traffic. -#The behavious seems to have changed now - the spider identifies itself -#when it accesses content pages. -#20141401 Behavior as before: Does identify itself when it accesses -# robots.txt and the root page. The following traffic does not contain -# the identification string and is therefore counted as user traffic. 
-'iconoclast','Popular Iconoclast', -'ilse','Ingrid', -'imagelock','Imagelock', -'incywincy','IncyWincy', -'informant','Informant', -'infoseek','InfoSeek Robot 1.0', -'infoseeksidewinder','Infoseek Sidewinder', -'infospider','InfoSpiders', -'inspectorwww','Inspector Web', -'intelliagent','IntelliAgent', -'ips\-agent', 'ips-agent Verisign(?) - no reliable information found.', -'irobot','I, Robot', -'iron33','Iron33', -'israelisearch','Israeli-search', -'javabee','JavaBee', -'jbot','JBot Java Web Robot', -'jcrawler','JCrawler', -'jobo','JoBo Java Web Robot', -'jobot','Jobot', -'joebot','JoeBot', -'jubii','The Jubii Indexing Robot', -'jumpstation','JumpStation', -'kapsi','image.kapsi.net', -'katipo','Katipo', -'kilroy','Kilroy', -'ko[_+\s]yappo[_+\s]robot','KO_Yappo_Robot', -'kummhttp','KummHttp', -'labelgrabber\.txt','LabelGrabber', -'larbin','larbin', -'legs','legs', -'linkidator','Link Validator', -'linkscan','LinkScan', -'lockon','Lockon', -'logo_gif','logo.gif Crawler', -'macworm','Mac WWWWorm', -'lmspider','lmspider', -'lwp\-request','lwp-request', -'lwp\-trivial','lwp-trivial', -'magpie','MagpieRSS', -'marvin','marvin/infoseek', -'mattie','Mattie', -'mediafox','MediaFox', -'merzscope','MerzScope', -'meshexplorer','NEC-MeshExplorer', -'mindcrawler','MindCrawler', -'mnogosearch','mnoGoSearch search engine software', -'momspider','MOMspider', -'monster','Monster', -'motor','Motor', -'muncher','Muncher', -'mwdsearch','Mwd.Search', -'ndspider','NDSpider', -'nederland\.zoek','Nederland.zoek', -'netcarta','NetCarta WebMap Engine', -'netmechanic','NetMechanic', -'netscoop','NetScoop', -'newscan\-online','newscan-online', -'nhse','NHSE Web Forager', -'northstar','The NorthStar Robot', -'nzexplorer','nzexplorer', -'objectssearch','ObjectsSearch', -'occam','Occam', -'octopus','HKU WWW Octopus', -'openfind','Openfind data gatherer', -'orb_search','Orb Search', -'packrat','Pack Rat', -'pageboy','PageBoy', -'parasite','ParaSite', -'patric','Patric', -'pegasus','pegasus', 
-'perignator','The Peregrinator', -'perlcrawler','PerlCrawler 1.0', -'phantom','Phantom', -'phpdig','PhpDig', -'piltdownman','PiltdownMan', -'pimptrain','Pimptrain.com\'s robot', -'pioneer','Pioneer', -'pitkow','html_analyzer', -'pjspider','Portal Juice Spider', -'plumtreewebaccessor','PlumtreeWebAccessor', -'poppi','Poppi', -'portalb','PortalB Spider', -'psbot','psbot', -'python','Python-urllib', -'raven','Raven Search', -'rbse','RBSE Spider', -'resumerobot','Resume Robot', -'rhcs','RoadHouse Crawling System', -'road_runner','Road Runner: The ImageScape Robot', -'robbie','Robbie the Robot', -'robi','ComputingSite Robi/1.0', -'robocrawl','RoboCrawl Spider', -'robofox','RoboFox', -'robozilla','Robozilla', -'roverbot','Roverbot', -'rules','RuLeS', -'safetynetrobot','SafetyNet Robot', -'semalt', 'seamalt.com', -'search\-info','Sleek', -'search_au','Search.Aus-AU.COM', -'searchprocess','SearchProcess', -'senrigan','Senrigan', -'sgscout','SG-Scout', -'shaggy','ShagSeeker', -'shaihulud','Shai\'Hulud', -'sift','Sift', -'simbot','Simmany Robot Ver1.0', -'sistrix', 'SISTRIX Crawler', -'site\-valet','Site Valet', -'sitetech','SiteTech-Rover', -'skymob','Skymob.com', -'slcrawler','SLCrawler', -'smartspider','Smart Spider', -'snooper','Snooper', -'solbot','Solbot', -'speedy','Speedy Spider', -'spider[_+\s]monkey','Spider monkey', -'spiderbot','SpiderBot', -'spiderline','Spiderline Crawler', -'spiderlytics', 'Spiderlytics: No homepage, e-mail only: spider (at) spiderlytics.com', -'spiderman','Spiderman', -'spiderview','SpiderView(tm)', -'spry','Spry Wizard Robot', -'ssearcher','Site Searcher', -'sqworm','Sqworm', -'suke','Suke', -'sunrise','Sunrise', -'suntek','suntek search engine', -'sven','Sven', -'tach_bw','TACH Black Widow', -'tagyu_agent','Tagyu Agent', -'tarantula','Tarantula', -'tarspider','tarspider', -'tailrank','TailRank', -'techbot','TechBOT', -'templeton','Templeton', -'titan','TITAN', -'titin','TitIn', -'tkwww','The TkWWW Robot', -'tlspider','TLSpider', 
-'ucsd','UCSD Crawl', -'udmsearch','UdmSearch', -'universalfeedparser','UniversalFeedParser', -'urlck','URL Check', -'valkyrie','Valkyrie', -'verticrawl','Verticrawl', -'victoria','Victoria', -'visionsearch','vision-search', -'voidbot','void-bot', -'vwbot','VWbot', -'w3index','The NWI Robot', -'w3m2','W3M2', -'wallpaper','WallPaper (alias crawlpaper)', -'wanderer','the World Wide Web Wanderer', -'wapspider','w@pSpider by wap4.com', -'webbandit','WebBandit Web Spider', -'webcatcher','WebCatcher', -'webcopy','WebCopy', -'webfetcher','webfetcher', -'webfoot','The Webfoot Robot', -'webinator','Webinator', -'weblinker','WebLinker', -'webmirror','WebMirror', -'webmoose','The Web Moose', -'webquest','WebQuest', -'webreader','Digimarc MarcSpider', -'webreaper','WebReaper', -'websnarf','Websnarf', -'webspider','WebSpider', -'webvac','WebVac', -'webwalk','webwalk', -'webwalker','WebWalker', -'webwatch','WebWatch', -'whatuseek','whatUseek Winona', -'whowhere','WhoWhere Robot', -'wired\-digital','Wired Digital', -'wmir','w3mir', -'wolp','WebStolperer', -'wombat','The Web Wombat', -'wordpress','WordPress', -'worm','The World Wide Web Worm', -'woozweb','Woozweb Monitoring', -'wwwc','WWWC Ver 0.2.5', -'wz101','WebZinger', -'xenu\slink\ssleuth', 'Xenu'. "'" . 
's Link Sleuth (TM), see Wikipedia', -'xget','XGET', -# Other robots reported by users -'^finbot', 'finbot', -'^webindex$', 'WebIndex', -'1\-more_scanner','1-More Scanner', -'360spider','360spider', -'a6-indexer', 'A6-Indexer', -'accoona\-ai\-agent','Accoona-AI-Agent', -'activebookmark','ActiveBookmark', -'adamm_bot','AdamM Bot', -'adsbot-google', 'AdsBot-Google', -'advbot', 'AdvBot', -'affectv\.co\.uk', 'affectv.co.uk', -'almaden','IBM Almaden Research Center WebFountain™', -'aipbot','aipbot', -'aleadsoftbot','ALeadSoftbot', -'alpha_search_agent','Alpha Search Agent', -'allrati','Allrati', -'aport', 'Aport', -'applebot', 'Applebot', -'archive\-de\.com', 'Archive-de.com', -'archive\.org_bot','archive.org bot', -'argus','Argus', -'arianna\.libero\.it','arianna.libero.it', -'aspseek','ASPseek', -'asterias', 'Asterias', -'awbot', 'AWBot', -'backlinktest\.com', 'BacklinkCrawler', -'baiduspider','BaiDuSpider', -'becomebot', 'BecomeBot', -'bender','bender focused_crawler', -'betabot','BetaBot', -'biglotron','Biglotron', -'bittorrent_bot','BitTorrent Bot', -'biz360[_+\s]spider','Biz360 spider', -'blexbot', 'BLEXBot, seems to belong to the WebMeUp backlink tool', -'blogbridge[_+\s]service','BlogBridge Service', -'bloglines','Bloglines', -'blogpulse','BlogPulse ISSpider intelliseek.com', -'blogsearch','BlogSearch', -'blogshares','Blogshares Spiders', -'blogslive','Blogslive', -'blogssay','BlogsSay :: RSS Search Crawler', -'bncf\.firenze\.sbn\.it\/raccolta\.txt','Biblioteca Nazionale Centrale di Firenze', -'bobby', 'Bobby', -'boitho\.com\-dc','boitho.com-dc', -'bookmark\-manager','Bookmark-Manager', -'boris', 'Boris', -'bubing', 'BUbiNG', -'bumblebee', 'Bumblebee (relevare.com)', -'candlelight[_+\s]favorites[_+\s]inspector','Candlelight_Favorites_Inspector', -'careerbot', 'CareerBot', -'cbn00glebot','cbn00glebot', -'ccbot', 'Common Crawl', -'cerberian_drtrs','Cerberian Drtrs', -'cfnetwork','CFNetwork', -'cipinetbot','CipinetBot', -'checkweb_link_validator','CheckWeb link 
validator', -'cliqzbot', 'Cliqzbot', -'commons\-httpclient','Jakarta commons-httpclient', -'computer_and_automation_research_institute_crawler','Computer and Automation Research Institute Crawler', -'converamultimediacrawler','ConveraMultiMediaCrawler', -'converacrawler','ConveraCrawler', -'copubbot', 'CoPubbot', -'cscrawler','CsCrawler', -'cse_html_validator_lite_online','CSE HTML Validator Lite Online','cuasarbot','Cuasarbot', -'cursor','Cursor', -'custo','Custo', -'datafountains\/dmoz_downloader','DataFountains/DMOZ Downloader', -'dataprovider\.com', 'Dataprovider Site Explorer', -'daumoa', 'Daum', -'daviesbot', 'DaviesBot', -'daypopbot', 'DayPop', -'deepindex','Deepindex', -'deusu', 'DeuSu', -'dipsie\.bot','Dipsie', -'dnsgroup','DNSGroup', -'doccheckbot', 'doccheckbot/1.0, known to Project Honey Pot', -'domainchecker','DomainChecker', -'domainsdb\.net','DomainsDB.net', -'dotbot', 'DotBot, Open Site Explorer', -'duckduckgo-favicons-bot', 'DuckDuckGo-Favicons-Bot', -'dulance','Dulance', -'dumbot','Dumbot', -'dumm\.de\-bot','dumm.de-Bot', -'earthcom\.info','EARTHCOM.info', -'easydl','EasyDL', -'eccp', 'Eniro Sverige, email: search (at) eniro.com', -'edgeio\-retriever','edgeio-retriever', -'ernst[:blank:]2\.0', 'Ernst 2.0 (does not provide any further information)', -'ets_v','ETS Enterprise Translation Server', -'exactseek','ExactSeek Crawler', -'extreme[_+\s]picture[_+\s]finder','Extreme_Picture_Finder', -'eventax','eventax', -'everbeecrawler','EverbeeCrawler', -'everest\-vulcan','Everest-Vulcan', -'ezresult', 'Ezresult', -'enteprise','Fast Enteprise Crawler', -'facebook','FaceBook bot', -'facebot', 'Facebot (Facebook bot?)', -'fast\-search\-engine','Fast-Search-Engine (not fastsearch.com)', -'fast_enterprise_crawler','FAST Enterprise Crawler', -'fast_enterprise_crawler.*scrawleradmin\.t\-info@telekom\.de','FAST Enterprise Crawler * crawleradmin.t-info@telekom.de', -'finderlein[_+\s]research[_+\s]crawler', 'Finderlein Research Crawler 1.0 (no contact information 
given)', -'matrix_s\.p\.a\._\-_fast_enterprise_crawler','Matrix S.p.A. - FAST Enterprise Crawler', -'fast_enterprise_crawler.*t\-info_bi_cluster_crawleradmin\.t\-info@telekom\.de','FAST Enterprise Crawler * T-Info_BI_cluster crawleradmin.t-info@telekom.de', -'fastbot', 'fastbot', -'favicon','FavIconizer', -'favorg','FavOrg', -'favorites_sweeper','Favorites Sweeper', -'feedburner', 'Feedburner', -'feedfetcher\-google','Feedfetcher-Google', -'feedflow','FeedFlow', -'feedster','Feedster', -'feedsky','FeedSky', -'feedvalidator','FeedValidator', -'fetchbot', 'Fetchbot', -'filmkamerabot','FilmkameraBot', -'filterdb\.iss\.net', 'oBot', -'findexa_crawler','Findexa Crawler', -'firmilybot', 'Firmily Bot Home page (Website was hacked on Oct. 19, 2013)', -'findlinks','Findlinks', -'foaf-search\.net', 'Friend of a friend (FOAF) search engine', -'fooky\.com\/ScorpionBot','Fooky.com/ScorpionBot/ScoutOut', -'g2crawler','G2Crawler', -'gaisbot','Gaisbot', -'geniebot','Geniebot', -'genieo', 'Genieo', -'gigablastopensource', 'GigablastOpenSource, an Open Source Search Engine(Wiki)', -'gigabot','GigaBot', -'girafabot','Girafabot', -'global_fetch','Global Fetch', -'gnodspider','GNOD Spider', -'goforit\.com','GoForIt.com', -'goforitbot','GOFORITBOT', -'gonzo','suchen.de', -'gpu_p2p_crawler','GPU p2p crawler', -'grapeshot', 'Grapeshot Crawler', -'grub','Grub.org', -'henrythemiragorobot', 'Mirago', -'heritrix','Heritrix', -'holmes', 'Holmes', -'hoowwwer','HooWWWer', -'hpprint','HPPrint', -'htmlparser','HTMLParser', -'html[_+\s]link[_+\s]validator','Html_Link_Validator', -'httrack','HTTrack off-line browser', -'hundesuche\.com\-bot','Hundesuche.com-Bot', -'i-bot','i-bot', -'icarus6j', 'Icarus6j, email address in UA string, no website', -'ichiro','ichiro', -'idmarch', 'IDMARCH', -'iltrovatore\-setaccio','IlTrovatore-Setaccio', -'implisensebot', 'ImplisenseBot', -'infobot','InfoBot', -'infociousbot','InfociousBot', -'infohelfer','Infohelfer', -'infomine','INFOMINE VLCrawler', 
-'insurancobot','InsurancoBot', -'integromedb\.org','IntegromeDB', -'internet[_+\s]ninja','Internet_Ninja ', -'internetarchive','InternetArchive', -'internetseer', 'InternetSeer', -'internetsupervision','InternetSupervision', -'irlbot','IRLbot', -'isearch2006','isearch2006', -'istellabot', 'IstellaBot', -'iupui_research_bot','IUPUI_Research_Bot', -'izsearch', 'iZSearch', -'james\sbot', 'James BOT', -'jobboerse', 'Jobbörse', -'jrtwine[_+\s]software[_+\s]check[_+\s]favorites[_+\s]utility','JRTwine_Software_Check_Favorites_Utility', -'justview', 'JustView', -'kalambot','KalamBot', -'kamano\.de_newsfeedverzeichnis','kamano.de NewsFeedVerzeichnis', -'kazoombot','KazoomBot', -'kevin','Kevin', -'keyoshid','Yahoo! Japan keyoshid robot study', -'kinjabot', 'Kinjabot', -'kinja\-imagebot', 'Kinja Imagebot', -'knowitall','KnowItAll', -'knowledge\.com','Knowledge.com', -'kouaa_krawler','Kouaa Krawler', -'krugle','Krugle', -'ksibot','ksibot', -'kurzor','Kurzor', -'lanshanbot','lanshanbot', -'letscrawl\.com','LetsCrawl.com', -'libcrawl','Crawl libcrawl', -'link_valet_online','Link Valet Online', -'linkbot','LinkBot', -'linkdex\.com', 'Linkdex', -'linkchecker','LinkChecker', -'linkstats\sbot', 'LinkStats Bot', -'lipperhey', 'Lipperhey SEO Service', -'livejournal\.com', 'LiveJournal.com', -'loadtimebot', 'LoadTimeBot', -'lssrocketcrawler', 'LSSRocketCrawler (no contact information)', -'ltbot', 'Language Tools Bot (ltbot)', -'ltx71', 'ltx71', -'madaali\.de', 'www.madaali.de', -'magpierss', 'MagpieRSS', -'mail\.ru', 'Mail.ru bot', -'mapoftheinternet\.com','MapoftheInternet.com', -'meanpathbot', 'Meanpathbot', -'mediabot', 'MediaBot', -'mediapartners\-google','Google AdSense', -# 'Mediapartners-Google (Feb 12, 2015: no additial information in UA String, seems to use GigablastOpenSource', -# Uses UA string "Mediapartners-Google" only, and there were accesses using an UA string "GigablastOpenSource/1.0" from the same IP-Address. 
-# Therefore this is probably not related to Google 4.3.2015 Albrecht Müller -'megaindex', 'MegaIndex Crawler, seems to belong to MegaIndex.ru', -'megite','Megite', -'memorybot', 'Archivethe.net', -'metager2-verification-bot', 'metager2-verification-bot', -'metager\-linkchecker','MetaGer LinkChecker', -'metajobbot', 'MetaJobBot', -'metaspinner','Metaspinner', -'miadev', 'MiaDev spider', -'microsoft\sbits', 'Microsoft Background Intelligent Transfer Service (BITS)?', -'microsoft.*discovery', 'Microsoft Office Protocol Discovery/Microsoft Office Existence Discovery', -'microsoft[_+\s]url[_+\s]control','Microsoft URL Control', -'mindupbot', 'mindUpBot (datenbutler.de)', -'minirank','miniRank', -'mini\-reptile','Mini-reptile', -'missigua_locator','Missigua_Locator', -'misterbot','Misterbot', -'miva','Miva', -'mizzu_labs','Mizzu Labs', -'mj12bot','MJ12bot', -'mojeekbot','MojeekBot', -'msiecrawler','MSIECrawler', -'ms[_+\s]search[_+\s]6\.0[_+\s]robot','MS Search 6.0 Robot (MS SharePoint Portal Server?)', -'ms_search_4\.0_robot','MS SharePoint Portal Server - MS Search 4.0 Robot', -'msrabot','msrabot', -'msrbot','MSRBOT', -'mt::telegraph::agent','MT::Telegraph::Agent', -'mydoyouhike','Mydoyouhike', -'nagios','Nagios', -'nasa_search','NASA Search', -'netestate\sne\scrawler','Website-Datenbank', -'netluchs','Netluchs', -'netsprint','NetSprint', -'newsgatoronline', 'NewsGator Online', -'nicebot','nicebot', -'nimblecrawler','NimbleCrawler', -'noxtrumbot','noxtrumbot', -'npbot','NPBot', -'loocalcrawler/nutch', 'LoocalCrawler/Nutch', -'nutchcvs','NutchCVS', -'nutchosu\-vlib','NutchOSU-VLIB', -'nutch','Nutch', -'ocelli','Ocelli', -'octora_beta_bot','Octora Beta Bot', -'omniexplorer[_+\s]bot','OmniExplorer Bot', -'onet\.pl[_+\s]sa','Onet.pl_SA', -'onfolio','Onfolio', -'opentaggerbot','OpenTaggerBot', -'openwebspider','OpenWebSpider', -'optimizer', 'Optimizer', -'oracle_ultra_search','Oracle Ultra Search', -'orangebot', 'OrangeBot, no website, log entry specifies mail address', #
support.orangebot@orange.com -'orbiter','Orbiter', -'yodaobot','OutfoxBot/YodaoBot', -'qihoobot','QihooBot', -'qwantify', 'Qwant', -'passwordmaker\.org','passwordmaker.org', -'pear_http_request_class','PEAR HTTP Request class', -'peerbot','PEERbot', -'perman', 'Perman surfer', -'php[_+\s]version[_+\s]tracker','PHP version tracker', -'phpcrawl', 'PHPCrawl', -'picmole', 'Specified address www.picmole.com was not reachable on April 21, 2014', -'pictureofinternet','PictureOfInternet', -'ping\.blo\.gs','ping.blo.gs', -'plinki','plinki', -'pluckfeedcrawler','PluckFeedCrawler', -'plukkie', 'Plukkie', -'pogodak','Pogodak.com', -'pompos','Pompos', -'popdexter','Popdexter', -'port_huron_labs','Port Huron Labs', -'postfavorites','PostFavorites', -'projectwf\-java\-test\-crawler','ProjectWF-java-test-crawler', -'proodlebot','proodleBot', -'publiclibraryarchive', 'publiclibraryarchive.org (related to spiderlytics.com and/or waybackarchive.org?)', -#Observations 2014-06-23 -#Domain publiclibraryarchive.org is parked at GoDaddy.com -#from https://www.projecthoneypot.org/ -#81.30.151.220's User Agent Strings (honeypot classified this ip as an mail server, active about 6 years ago) -#Mozilla/5.0 (compatible; publiclibraryarchive.org/1.0; +crawl@publiclibraryarchive.org) -#176.9.138.27's User Agent Strings -#Mozilla/5.0 (compatible; publiclibraryarchive.org/1.0; +crawl@publiclibraryarchive.org) -#Mozilla/5.0 (compatible; Spiderlytics/1.0; +spider@spiderlytics.com) -#Mozilla/5.0 (compatible; waybackarchive.org/1.0; +spider@waybackarchive.org) -#146.0.32.165's User Agent Strings -#Mozilla/5.0 (compatible; publiclibraryarchive.org/1.0; +crawl@publiclibraryarchive.org) -#Mozilla/5.0 (compatible; savetheworldheritage.org/1.0; +crawl@savetheworldheritage.org) -#Mozilla/5.0 (compatible; seoscanners.net/1; +spider@seoscanners.net) -'pyquery','PyQuery', -'rambler','StackRambler', -'redalert','Red Alert', -'relevantnoise\.com', 'Relevant Noise', -'riddler', 'Riddler', -'rogerbot', 'Rogerbot', 
-'rojo','RoJo aggregator', -'rssimagesbot','rssImagesBot', -'ruffle','ruffle SemanticWeb crawler', -'rufusbot','RufusBot Rufus Web Miner', -'safeads\.xyz', 'SafeAds.xyz', -'safesearch', 'Avira SafeSearch', -'sandcrawler','SandCrawler (Microsoft)', -'savetheworldheritage', 'savetheworldheritage.org (related to spiderlytics.com, waybackarchive.org and/or publiclibraryarchive.org?)', -'sbider','SBIder', -'schizozilla','Schizozilla', -'scumbot','Scumbot', -'searchguild[_+\s]dmoz[_+\s]experiment','SearchGuild_DMOZ_Experiment', -'searchmetricsbot','SearchmetricsBot', -'seekbot','Seekbot', -'semrushbot', 'SemrushBot', -'sensis_web_crawler','Sensis Web Crawler', -'seodiver', 'SEO DIVER', -'seokicks\.de', 'SEOkicks Webcrawler', -'seoscanners', 'seoscanners.net (related to publiclibraryarchive.org and savetheworldheritage.org?)', -'seznambot','SeznamBot', -'shim\-crawler','Shim-Crawler', -'shoutcast','Shoutcast Directory Service', -'sitedomain-bot', 'Sitedomain.de', -'siteexplorer\.info', 'Site Explorer', -'skimbot', 'SkimBot', -'slysearch','SlySearch', -'smtbot', 'SMTBot', -'snap\.com_beta_crawler','snap.com beta crawler', -'sohu\-search','sohu-search', -'sohu','sohu agent', -'snappy','Snappy', -'spbot', 'SEOprofiler Bot', -'sphere_scout','Sphere Scout', -'spip','SPIP', -'sproose_crawler','sproose crawler', -'ssearch_bot', 'sSearch Crawler', -'steroid__download','STEROID Download', -'steeler','Steeler', -'stq_bot', 'SEARCHTEQ', -'suchfin\-bot','Suchfin-Bot', -'superbot','SuperBot', -'surveybot','SurveyBot', -'susie','Susie', -'syndic8','Syndic8', -'syndicapi','SyndicAPI', -'synoobot','SynooBot', -'tcl_http_client_package','Tcl http client package', -'technoratibot', 'Technoratibot', -'teragramcrawlersurf','TeragramCrawlerSURF', -'test_crawler','Test Crawler', -'testbot','TestBot', -'thumbsniper', 'ThumbSniper', -'t\-h\-u\-n\-d\-e\-r\-s\-t\-o\-n\-e','T-H-U-N-D-E-R-S-T-O-N-E', -'topicblogs', 'topicblogs', -'turnitinbot', 'Turn It In', -'turtle', 'Turtle', -'turtlescanner', 
'Turtle', -'tutorgigbot','TutorGigBot', -'twiceler','twiceler', -'ubicrawler','UbiCrawler', -'ultraseek', 'Ultraseek', -'unchaos_bot_hybrid_web_search_engine','UnChaos Bot Hybrid Web Search Engine', -'unido\-bot','unido-bot', -'unisterbot', 'UnisterBot; E-Mail only: crawler (at) unister.de', -'updated','updated', -'ustc\-semantic\-group','USTC-Semantic-Group', -'vagabondo\-wap','Vagabondo-WAP', -'vagabondo','Vagabondo', -'vebidoobot', 'vebidoobot', -'vermut','Vermut', -'versus_crawler_from_eda\.baykan@epfl\.ch','versus crawler from eda.baykan@epfl.ch', -'vespa_crawler','Vespa Crawler', -'voltron', 'voltron', -'vortex','VORTEX', -'vse\/','VSE', -'w3c\-checklink','W3C Link Checker', -'w3c[_+\s]css[_+\s]validator[_+\s]jfouffa', 'W3C jigsaw CSS Validator', -'w3c_validator','W3C Validator', -'watchmouse', 'WatchMouse Website Monitor', -'wavefire','Wavefire', -'waybackarchive\.org', 'No website, email: spider(at)waybackarchive.org', -# 2.12.2013 Project Honeypot reports at least one of the IPs used by waybackarchive with a spiderlytics UA string. -# Problably not related to the wayback machine of archive.org. 
-'wbsearchbot', 'WBSearchBot', -'webclipping\.com', 'WebClipping.com', -'webcompass', 'webcompass', -'webcrawl\.net','webcrawl.net', -'web_downloader','Web Downloader', -'webdup','Webdup', -'webfilter','WebFilter', -'webindexer','WebIndexer', -'webminer','WebMiner', -'website[_+\s]monitoring[_+\s]bot','Website_Monitoring_Bot', -'webvulncrawl', 'WebVulnCrawl', -'wells_search','Wells Search', -'wer-liefert-was', 'Wer-liefert-was Crawler Note: AWStats counts most traffic as user traffic', -'wesee:search', 'WeSEE Bot', -'wevikabot', 'WeViKa', -'wonderer', 'Web Wombat Redback Spider', -'wotbox', 'Wotbox', -'wume_crawler','wume crawler', -'wwweasel',,'WWWeasel', -'xenu\'s_link_sleuth','Xenu Link Sleuth', -'xenu_link_sleuth','Xenu Link Sleuth', -'xirq','xirq', -'xovibot', 'XoviBot', -'y!j', 'Y!J Yahoo Japan', -'yacy', 'YaCy', -'yahoo\-blogs','Yahoo-Blogs', -'yahoo\-verticalcrawler', 'Yahoo Vertical Crawler', -'yahoofeedseeker', 'Yahoo Feed Seeker', -'yahooseeker\-testing', 'YahooSeeker-Testing', -'yahooseeker', 'YahooSeeker Yahoo! Blog crawler', -'yahoo\-mmcrawler', 'Yahoo-MMCrawler', -'yahoo!_mindset','Yahoo! 
Mindset', -'yandex', 'Yandex Bot', -'flexum', 'Flexum Search Engine', -'yanga', 'Yanga WorldSearch Bot', -'yet-another-spider','Yet-Another-Spider', -'yisouspider', 'YisouSpider (no additional information in UA string)', -'yooglifetchagent','yoogliFetchAgent', -'z\-add_link_checker','Z-Add Link Checker', -'zealbot','ZealBot', -'zhuaxia','ZhuaXia', -'zspider','zspider', -'zeus','Zeus Webster Pro', -'zumbot','ZumBot', -'ng\/1\.','NG 1.x (Exalead)', # put at end to avoid false positive -'ng\/2\.','NG 2.x (Exalead)', # put at end to avoid false positive -'exabot','Exabot', # put at end to avoid false positive -# Other id that are 99% of robots -'wget','WGet tools', -'libwww','Perl tool', -'^java\/[0-9]','Java (Often spam bot)', # put at end to avoid false positive -# Generic robot -'robot', 'Unknown robot (identified by \'robot\')', -'checker', 'Unknown robot (identified by \'checker\')', -'crawl', 'Unknown robot (identified by \'crawl\')', -'discovery', 'Unknown robot (identified by \'discovery\')', -'hunter', 'Unknown robot (identified by \'hunter\')', -'scanner', 'Unknown robot (identified by \'scanner\')', -'spider', 'Unknown robot (identified by \'spider\')', -'sucker', 'Unknown robot (identified by \'sucker\')', -'bot[\s_+:,\.\;\/\\\-]', 'Unknown robot (identified by \'bot\' followed by a space or one of the following characters _+:,.;/\-)', -'[\s_+:,\.\;\/\\\-]bot', 'Unknown robot (identified by a space or one of the characters _+:,.;/\- followed by \'bot\')', -'curl', 'Common *nix tool for automating web document retrieval. Most likely a bot.', -'php', 'A PHP script', -'ruby\/', 'Ruby script', -# Additional bots found by Sussex. 
-'^[1-3]$', 'Generic bot identified as "1", "2" or "3"', -'alltop', 'alltop', -'applesyndication', 'applesyndication', -'asynchttpclient', 'asynchttpclient', -'bingbot', 'Bingbot', -'blogged_crawl', 'blogged_crawl', -'bloglovin', 'bloglovin', -'butterfly', 'butterfly', -'buzztracker', 'buzztracker', -'carpathia', 'carpathia', -'catbot', 'catbot', -'chattertrap', 'chattertrap', -'check_http', 'check_http (nagios)', -'coldfusion', 'coldfusion', -'covario', 'covario', -'daylifefeedfetcher', 'daylifefeedfetcher', -'discobot', 'discobot', -'dlvr\.it', 'dlvr.it', -'dreamwidth', 'dreamwidth', -'drupal', 'Drupal Site', -'ezoom', 'ezoom', -'feedmyinbox', 'feedmyinbox', -'feedroll\.com', 'feedroll.com', -'feedzira', 'feedzira', -'fever\/', 'Feed a Fever', -'freenews', 'freenews', -'geohasher', 'geohasher', -'hanrss', 'hanrss', -'inagist', 'inagist', -'jacobin\sclub', 'jacobin club', -'jakarta', 'jakarta', -'js\-kit', 'js-kit', -'largesmall\scrawler', 'largesmall crawler', -'linkedinbot', 'linkedinbot', -'longurl', 'longurl', -'metauri', 'metauri', -'microsoft\-webdav\-miniredir', 'microsoft-webdav-miniredir', -'^motorola$', 'Suspected Bot masquerading as "Motorola"', -'movabletype', 'movabletype', -'^mozilla\/3\.0\s\(compatible$', 'Suspected bot masqurading as Mozilla', -'^mozilla\/4\.0$', 'Suspected bot masqurading as Mozilla', -'^mozilla\/4\.0\s\(compatible;\)$', 'Suspected bot masqurading as Mozilla', -'^mozilla\/5\.0$', 'Suspected bot masqurading as Mozilla', -'^mozilla\/5\.0\s\(compatible;$', 'Suspected bot masqurading as Mozilla', -'^mozilla\/5\.0\s\(en\-us\)$', 'Suspected bot masqurading as Mozilla', -'^mozilla\/5\.0\sfirefox\/3\.0\.5$', 'Suspected bot masqurading as Mozilla', -'^msie', 'Suspected bot masquerading as M$ IE', -'netnewswire', 'netnewswire', -'\snetseer\s', 'Net Seer', -'netvibes', 'netvibes', -'newrelicpinger', 'newrelicpinger', -'newsfox', 'Fox News', -'nextgensearchbot', 'nextgensearchbot', -'ning', 'ning', -'pingdom', 'pingdom', -'pita', 'pita (pain 
in the ass?)', -'postpost', 'postpost', -'postrank', 'postrank', -'printfulbot', 'printfulbot', -'protopage', 'protopage', -'proximic', 'Proximic Spider', -'quipply', 'quipply', -'r6\_', 'Radian 6 Crawler', -'ratingburner', 'ratingburner', -'regator', 'regator', -'rome\sclient', 'rome client', -'rpt\-httpclient', 'rpt-httpclient', -'rssgraffiti', 'rssgraffiti', -'sage\+\+', 'sage++', -'scoutjet', 'ScoutJet crawler for Blekko.', -'simplepie', 'simplepie', -'sitebot', 'sitebot', -'summify\.com', 'summify.com', -'superfeedr', 'superfeedr', -'synthesio', 'synthesio', -'teoma', 'teoma', -'topblogsinfo', 'topblogsinfo', -'topix\.net', 'topix.net', -'trapit', 'trapit', -'trileet', 'trileet', -'tweetedtimes', 'The Tweeted Times', -'twisted\spagegetter', 'twisted pagegetter', -'twitterbot', 'Twitterbot', -'twitterfeed', 'twitterfeed', -'unwindfetchor', 'unwindfetchor', -'wazzup', 'wazzup', -'windows\-rss\-platform', 'windows-rss-platform', -'wiumi', 'wiumi', -'xydo', 'xydo', -'yahoo!\sslurp', 'Additional Yahoo bots.', -'yahoo\spipes', 'Additional Yahoo bots.', -'yahoo\-newscrawler', 'Additional Yahoo bots.', -'yahoocachesystem', 'Additional Yahoo bots.', -'yahooexternalcache', 'Additional Yahoo bots.', -'yahoo!\ssearchmonkey', 'Additional Yahoo bots.', -'yahooysmcm', 'Additional Yahoo bots.', -'yammer', 'yammer', -#'yandexbot', 'yandexbot', #already covered by 'yandex' -'yeti', 'yeti', -'yie8', 'yie8', -'youdao', 'youdao', -'yourls', 'yourls', -'zemanta', 'zemanta', -'zend_http_client', 'Zend Http Client', -'no_user_agent','Unknown robot (identified by empty user agent string)', -# Unknown robots identified by hit on robots.txt -'unknown', 'Unknown robot (identified by hit on \'robots.txt\')' -); - - -# RobotsAffiliateLib -# This list try to tell by which Search Engine a robot is used -#------------------------------------------------------------- -%RobotsAffiliateLib = ( -'bingpreview'=>'Bing', -'fast\-webcrawler'=>'AllTheWeb', -'googlebot'=>'Google', 
-'google\-sitemap'=>'Google', -'google[_+\s]web[_+\s]preview'=>'Google', -'msnbot'=>'MSN', -'nutch'=>'Looksmart', -'scooter'=>'AltaVista', -'wisenutbot'=>'Looksmart', -'yahoo\-blogs'=>'Yahoo', -'yahoo\-verticalcrawler'=>'Yahoo', -'yahoofeedseeker'=>'Yahoo', -'yahooseeker\-testing'=>'Yahoo', -'yahooseeker'=>'Yahoo', -'yahoo\-mmcrawler'=>'Yahoo', -'yahoo!_mindset'=>'Yahoo', -'zyborg'=>'Looksmart', -'cfetch'=>'Kosmix', -'^voyager\/'=>'Kosmix', -# Additional bots found by Sussex. -'feedfetcher\-google'=>'Google', -'bingbot'=>'MSN', -'twitterbot'=>'Twitter', -'twitterfeed'=>'Twitter', -'yahoo!\sslurp'=>'Yahoo', -'yahoo\spipes'=>'Yahoo', -'yahoo-newscrawler'=>'Yahoo', -'yahoocachesystem'=>'Yahoo', -'yahooexternalcache'=>'Yahoo', -'yahoo!\ssearchmonkey'=>'Yahoo', -'yahooysmcm'=>'Yahoo' -); - -1; +# AWSTATS ROBOTS DATABASE +#------------------------------------------------------- +# If you want to add robots to extend AWStats database detection capabilities, +# you must add an entry in RobotsSearchIDOrder_listx and RobotsHashIDLib. + +# The entry in RobotsSearchIDOrder_listx is a Perl regular expression +# (see http://perldoc.perl.org/perlreref.html). AWStats applies these +# expressions to the user agent string in the order given by the lists. The +# first match specifies the robot. +# +# Note: This regular expression must not contain any whitespace. +# Otherwise AWStats will produce lines in the database that +# will be misinterpreted and as a consequence the corresponding data in the +# generated HTML reports will be wrong. If you want to match whitespace in +# the user agent string, use other constructs like '\s', '[[:blank:]]', +# '\p{IsSpace}', '\x20' etc. +# +# The corresponding entry in RobotsHashIDLib contains the regular expression +# as key, followed by a string containing HTML-text. AWStats inserts this +# text into reports to describe the bot. If possible the text should contain +# a link to the bot home page.
This makes it easier for sysadmins to find +# the information necessary e.g. to adapt the robots.txt file. +# +# An entry in the RobotsAffiliateLib is not necessary. An entry in this list +# contains as first part the regular expression specifying the bot. The +# second part is a string that gives the Company or product managing the bot. +# This information is not used yet. +# +# There are several sorts of bots that AWStats is not able to detect and +# therefore a considerable amount of bot generated traffic counts +# as user traffic: +# +# a) A crawler that identifies itself in the referrer string, but not in +# the user agent string. An example is the crawler from semalt.semalt.com. +# +# b) Crawlers that correctly access robots.txt but identify themselves in +# the user agent string only once or just a few times. Most of the +# time a user agent string is used that does not contain hints that +# a bot is involved. An example is the iCjobs spider. +# msnbot-UDiscovery/2.0b seems to show this behaviour too. +# +# +# +#------------------------------------------------------- + +# 2005-08-19 Sean Carlos http://www.antezeta.com/awstats.html +# added dipsie (not tested with real data). +# added DomainsDB.net http://domainsdb.net/ +# added ia_archiver-web.archive.org (was inadvertently grouped with Alexa traffic) +# added Nutch (used by looksmart (furl?)) +# added rssImagesBot +# added Sqworm +# added t\-h\-u\-n\-d\-e\-r\-s\-t\-o\-n\-e +# added w3c css-validator +# added documentation link to bot home pages for above and selected major bots. +# In the case of international bots, choose .com page. +# Included tool tip (html "title"). +# To do: parameterize to match both AWStats language and tooltips settings. +# To do: add html links for all bots based on current documentation in source +# files referenced below.
+# changed '\wbot[\/\-]', to '\wbot[\/\-]' (removed comma) +# made minor grammar corrections to notes below +# 2005-08-24 added YahooSeeker-Testing +# added w3c-checklink +# updated url for ask.com +# 2005-08-24 added Girafabot http://www.girafa.com/ +# 2005-08-30 added PluckFeedCrawler http://www.pluck.com/ +# added Gaisbot/3.0 (robot05@gais.cs.ccu.edu.tw; ) +# added geniebot (wgao@genieknows.com) +# added BecomeBot link http://www.become.com/site_owners.html +# added topicblogs http://www.topicblogs.com/ +# added Powermarks; seen used by referrer spam +# added YahooSeeker +# added NG/2. http://www.exabot.com/ +# 2005-09-15 added link for Walhello appie +# added bender focused_crawler +# updated YahooSeeker description (blog crawler) +# 2005-09-16 added link for http://linkchecker.sourceforge.net +# added ConveraCrawler/0.9d ( http://www.authoritativeweb.com/crawl) +# added Blogslive info@blogslive.com intelliseek.com +# added BlogPulse (ISSpider-3.0) intelliseek.com +# 2005-09-26 added Feedfetcher-Google (http://www.google.com/feedfetcher.html) +# added EverbeeCrawler +# added Yahoo-Blogs http://help.yahoo.com/help/us/ysearch/crawling/crawling-02.html +# added link for Bloglines http://www.bloglines.com +# 2005-10-19 fixed Feedfetcher-Google (http://www.google.com/feedfetcher.html) +# added Blogshares Spiders (Synchronized V1.5.1) +# added yacy +# 2005-11-21 added Argus www.simpy.com +# added BlogsSay :: RSS Search Crawler (http://www.blogssay.com/) +# added MJ12bot http://majestic12.co.uk/bot.php +# added OpenTaggerBot (http://www.opentagger.com/opentaggerbot.htm) +# added OutfoxBot/0.3 (For internet experiments; outfox.agent@gmail.com) +# added RufusBot Rufus Web Miner http://64.124.122.252.webaroo.com/feedback.html +# added Seekbot (http://www.seekbot.net/bot.html) +# added Yahoo-MMCrawler/3.x (mms-mmcrawler-support@yahoo-inc.com) +# added link for BaiDuSpider +# added link for Blogshares Spider +# added link for StackRambler http://www.rambler.ru/doc/faq.shtml
+# added link for WISENutbot +# added link for ZyBorg/1.0 (wn-14.zyborg@looksmart.net; http://www.WISEnutbot.com). Moved location to above wisenut to avoid classification as wisenut +# 2005-12-15 +# added FAST Enteprise Crawler/6 (www dot fastsearch dot com). Note spelling Enteprise not Enterprise. +# added findlinks http://wortschatz.uni-leipzig.de/findlinks/ +# added IBM Almaden Research Center WebFountain™ http://www.almaden.ibm.com/cs/crawler [hc3] +# added INFOMINE/8.0 VLCrawler (http://infomine.ucr.edu/useragents) +# added lmspider (lmspider@scansoft.com) http://www.nuance.com/ +# added noxtrumbot http://www.noxtrum.com/ +# added SandCrawler (Microsoft) +# added SBIder http://www.sitesell.com/sbider.html +# added SeznamBot http://fulltext.seznam.cz/ +# added sohu-search http://corp.sohu.com/ (looked for //robots.txt not /robots.txt) +# added the ruffle SemanticWeb crawler v0.5 - http://www.unreach.net +# added WebVulnCrawl/1.0 libwww-perl/5.803 (looked for //robots.txt not /robots.txt) +# added Yahoo! Japan keyoshid http://www.yahoo.co.jp/ +# added Y!J http://help.yahoo.co.jp/help/jp/search/indexing/indexing-15.html +# added link for GigaBot +# added link for MagpieRSS +# added link for MSIECrawler +# 2005-12-21 +# added aipbot http://www.aipbot.com aipbot@aipbot.com [matthys70 users.sourceforge.net] +# added Everest-Vulcan Inc./0.1 (R&D project; http://everest.vulcan.com/crawlerhelp) +# added Fast-Search-Engine http://www.fast-search-engine.com/ [matthys70 users.sourceforge.net] +# added g2Crawler (nobody@airmail.net) http://crawler.instantnetworks.net/ +# added Jakarta commons-httpclient http://jakarta.apache.org/commons/httpclient/ (hit robots.txt). May be used as robot or browser - a site may want to remove this entry. +# added OmniExplorer_Bot http://www.omni-explorer.com/ [matthys70 users.sourceforge.net] +# added USTC-Semantic-Group ai.ustc.edu.cn/mas/en/research/index.php ?
+# 2005-12-22 +# added EARTHCOM.info www.earthcom.info +# added HTTrack off-line browser 'httrack','HTTrack', http://www.httrack.com/ [Moizes Gabor] +# added KummHttp http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_g_l_301105_2\b [Moizes Gabor] +# 2006-01-01 +# added Dulance http://www.dulance.com/bot.jsp +# added MojeekBot http://www.mojeek.com/bot.html +# added nicebot http://www.egghelp.org/setup.htm ? +# added Snappy http://www.urltrends.com/faq.php +# added sohu agent +# added VORTEX http://marty.anstey.ca/robots/vortex/ [matthys70 users.sourceforge.net] +# added zspider http://feedback.redkolibri.com/ +# 2006-01-13 +# added boitho.com-dc http://www.boitho.com/dcbot.html +# added IRLbot http://irl.cs.tamu.edu/crawler +# added virus_detector virus_harvester@securecomputing.com +# added Wavefire http://www.wavefire.com; info@wavefire.com +# added WebFilter Robot +# 2006-01-24 +# added Shim-Crawler http://www.logos.ic.i.u-tokyo.ac.jp/crawler/; crawl@logos.ic.i.u-tokyo.ac.jp +# added Exabot exabot.com +# added LetsCrawl.com http://letscrawl.com +# added ichiro http://help.goo.ne.jp/door/crawlerE.html +# 2006-01-27 additional 22 robots from a list provided by Moizes Gabor +# added ALeadSoftbot http://www.aleadsoft.com/bot.htm +# added CipinetBot http://www.cipinet.com/bot.html +# added Cuasarbot http://www.cuasar.com/ +# added Dumbot http://www.dumbfind.com/ +# added Extreme_Picture_Finder http://www.exisoftware.com/ +# added Fooky.com/ScorpionBot/ScoutOut http://www.fooky.com/scorpionbots +# added IlTrovatore-Setaccio http://www.iltrovatore.it/aiuto/motore_di_ricerca.html bot@iltrovatore.it +# added InsurancoBot http://www.fastspywareremoval.com/ +# added InternetArchive http://lucene.apache.org/nutch/bot.html nutch-agent@lucene.apache.org +# added KazoomBot http://www.kazoom.ca/bot.html kazoombot@kazoom.ca +# added Kurzor http://www.easymail.hu/ cursor@easymail.hu +# added NutchCVS 
http://lucene.apache.org/nutch/bot.html nutch-agent@lucene.apache.org +# added NutchOSU-VLIB http://lucene.apache.org/nutch/bot.html nutch-agent@lucene.apache.org +# added Orbiter http://www.dailyorbit.com/bot.htm +# added PHP_version_tracker http://www.nexen.net/phpversion/bot.php +# added SuperBot http://www.sparkleware.com/superbot/ +# added SynooBot http://www.synoo.de/bot.html webmaster@synoo.com +# added TestBot http://www.agbrain.com/ +# added TutorGigBot http://www.tutorgig.info/ +# added WebIndexer mailto://webindexerv1@yahoo.com +# added WebMiner http://64.124.122.252/feedback.html +# 2006-02-01 +# added heritrix https://sourceforge.net/forum/message.php?msg_id=3550202 +# added Zeus Webster Pro https://sourceforge.net/forum/message.php?msg_id=3141164 +# additional robots from a list provided by Moizes Gabor [ mojzi -a-t- free mail hu ] +# added Candlelight_Favorites_Inspector +# added DomainChecker +# added EasyDL +# added FavOrg +# added Favorites_Sweeper +# added Html_Link_Validator +# added Internet_Ninja +# added JRTwine_Software_Check_Favorites_Utility +# fixed Microsoft_URL_Control +# added miniRank +# added Missigua_Locator +# added NPBot +# added Ocelli +# added Onet.pl_SA +# added proodleBot +# added SearchGuild_DMOZ_Experiment +# added Susie +# added Website_Monitoring_Bot +# added Xenu_Link_Sleuth +# 2006-05-15 +# added ASPseek http://www.aspseek.org/ +# added AdamM Bot http://home.blic.net/adamm/ +# added archive.org_bot http://crawls.archive.org/collections/bncf/crawl.html +# added arianna.libero.it (Italian Portal/search engine) +# added Biz360 spider http://www.biz360.com +# added BlogBridge Service http://www.blogbridge.com/ +# added BlogSearch http://www.icerocket.com/ +# added libcrawl +# added edgeio-relanshanbottriever http://www.edgeio.com +# added FeedFlow http://feedflow.com/about +# added Biblioteca Nazionale Centrale di Firenze (Italian National Archive) http://www.bncf.firenze.sbn.it/raccolta.txt +# added Java catchall - used by 
many spam bots +# added lanshanbot http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=%5Cbid_g_l_140406_1%5Cb +# added msnbot-media http://search.msn.com/msnbot.htm +# added MT::Telegraph::Agent +# added Netluchs http://www.netluchs.de/ (German SE bot) +# added oBot http://www.webmasterworld.com/forum11/1616.htm +# added Onfolio http://www.onfolio.com/ (IE Toolbar plugin) - hit rss feeds. +# added ping.blo.gs http://blo.gs/ping.php blog bot +# added Sphere Scout http://www.sphere.com/ +# added sproose crawler http://www.sproose.com/bot.html +# added SyndicAPI http://syndicapi.com/bot.html +# added Yahoo! Mindset http://mindset.research.yahoo.com/ +# added msrabot +# added Vagabondo & Vagabondo-WAP http://www.wise-guys.nl/Contact/index.php?botselected=webagents&lang=uk +# fixed Missigua Locator detection (Missigua_Locator -> Missigua Locator) +# changed echo to echo! to avoid conflict with the bonecho (Firefox 2.0) browser. +# This requires you to reprocess historic logs if you want EchO! to be recognized for older reports. +# 2006-05-17 +# added Alpha Search Agent # 62.152.125.60 Eurologon Srl +# added Krugle http://www.krugle.com/crawler/info.html the search engine for developers +# added Octora Beta Bot http://www.octora.com/ # Blog and Rss Search Engine +# added UbiCrawler http://law.dsi.unimi.it/ubicrawler/ +# added Yahoo! Slurp China http://misc.yahoo.com.cn/help.html +# You must reprocess old logs for the Yahoo! 
Slurp China bot to be detected in old reports +# 2006-05-20 +# added 1-More Scanner http://www.myzips.com/software/1-More-Scanner.phtml +# added Accoona-AI-Agent http://www.accoona.com/ +# added ActiveBookmark http://www.libmaster.com/active_bookmark.php +# added BIGLOTRON http://www.biglotron.com/robot.html +# added Bookmark-Manager http://bkm.sourceforge.net/ +# added cbn00glebot +# added Cerberian Drtrs http://www.pgts.com.au/cgi-bin/psql?robot_info=25240 +# added CFNetwork http://www.cocoadev.com/index.pl?CFNetwork +# added CheckWeb link validator http://p.duby.free.fr/chkweb.htm +# added Computer and Automation Research Institute Crawler http://www.ilab.sztaki.hu/~stamas/publications/p184-benczur.html +# added ConveraCrawler http://www.authoritativeweb.com/crawl/ +# added ConveraMultiMediaCrawler http://www.authoritativeweb.com/crawl/ +# added CSE HTML Validator Lite Online http://online.htmlvalidator.com/php/onlinevallite.php +# added Cursor http://adcenter.hu/docs/en/bot.html +# added Custo http://www.netwu.com/custo/ +# added DataFountains/DMOZ Downloader http://infomine.ucr.edu/ +# added Deepindex http://www.deepindex.net/faq.php +# added DNSGroup http://www.dnsgroup.com/ +# added DoCoMo http://www.nttdocomo.co.jp/ +# added dumm.de-Bot http://www.dumm.de/ +# added ETS v http://www.freetranslation.com/help/ +# added eventax http://www.eventax.de/ +# added FAST Enterprise Crawler * crawleradmin.t-info@telekom.de http://www.telekom.de/ +# added FAST Enterprise Crawler http://www.fast.no/ +# added FAST Enterprise Crawler * T-Info_BI_cluster crawleradmin.t-info@telekom.de http://www.telekom.de/ +# added FeedValidator http://feedvalidator.org/ +# added FilmkameraBot http://www.filmkamera.at/bot.html +# added Findexa Crawler http://www.findexa.no/gulesider/article26548.ece +# added Global Fetch http://www.wesonet.com/ +# added GOFORITBOT http://www.goforit.com/about/ +# added GoForIt.com http://www.goforit.com/about/ +# added GPU p2p crawler 
http://gpu.sourceforge.net/search_engine.php +# added HooWWWer http://cosco.hiit.fi/search/hoowwwer/ +# added HPPrint +# added HTMLParser http://htmlparser.sourceforge.net/ +# added Hundesuche.com-Bot http://www.hundesuche.com/ +# added InfoBot http://www.infobot.org/ +# added InfociousBot http://corp.infocious.com/tech_crawler.php +# added InternetSupervision http://internetsupervision.com/ +# added isearch2006 http://www.yahoo.com.cn/ +# added IUPUI_Research_Bot http://spamhuntress.com/2005/04/25/a-mail-harvester-visits/ +# added KalamBot http://64.124.122.251/feedback.html +# added kamano.de NewsFeedVerzeichnis http://www.kamano.de/ +# added Kevin http://dznet.com/kevin/ +# added KnowItAll http://www.cs.washington.edu/research/knowitall/ +# added Knowledge.com http://www.knowledge.com/ +# added Kouaa Krawler http://www.kouaa.com/ +# added ksibot http://ego.ms.mff.cuni.cz/ +# added Link Valet Online http://www.htmlhelp.com/tools/valet/ +# added lwp-request http://search.cpan.org/~gaas/libwww-perl-5.69/bin/lwp-request +# added lwp-trivial http://search.cpan.org/src/GAAS/libwww-perl-5.805/lib/LWP/Simple.pm +# added MapoftheInternet.com http://MapoftheInternet.com/ +# added Matrix S.p.A. 
- FAST Enterprise Crawler http://tin.virgilio.it/ +# added Megite http://www.megite.com/ +# added Metaspinner http://index.meta-spinner.de/ +# added Mini-reptile +# added Misterbot http://www.misterbot.fr/ +# added Miva http://www.miva.com/ +# added Mizzu Labs http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_m_141105_2\b +# added MSRBOT http://research.microsoft.com/research/sv/msrbot/ +# added MS SharePoint Portal Server - MS Search 4.0 Robot http://support.microsoft.com/default.aspx?scid=kb;en-us;284022 +# added Mydoyouhike http://www.doyouhike.net/my +# added NASA Search http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_n_s_140506_2\b +# added NetSprint http://www.netsprint.pl/serwis/ +# added NimbleCrawler http://www.healthline.com/ +# added OpenWebSpider http://www.openwebspider.org/ +# added Oracle Ultra Search http://www.oracle.com/technology/products/ultrasearch/index.html +# added OSSProxy http://www.marketscore.com/FAQ.Aspx +# added passwordmaker.org http://passwordmaker.org/ +# added PEAR HTTP Request class http://pear.php.net/ +# added PEERbot http://www.peerbot.com/ +# added PHP version tracker http://www.nexen.net/phpversion/bot.php +# added PictureOfInternet http://malfunction.org/poi/ +# added plinki http://www.plinki.com/ +# added Port Huron Labs http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_n_s_1133\b +# added PostFavorites http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_n_s_1135\b +# added ProjectWF-java-test-crawler +# added PyQuery http://sourceforge.net/projects/pyquery/ +# added Schizozilla http://spamhuntress.com/2005/03/18/gizmo/ +# added Scumbot +# added Sensis Web Crawler http://www.sensis.com.au/ +# added snap.com beta crawler http://www.snap.com/ +# added Steeler http://www.tkl.iis.u-tokyo.ac.jp/~crawler/ +# added STEROID Download 
http://faqs.org.ru/progr/pascal/delphi_internet2.htm +# added Suchfin-Bot http://www.suchfin.de/ +# added Sunrise http://www.sunrisexp.com/ +# added Tagyu Agent http://www.tagyu.com/ +# added Tcl http client package http://www.tcl.tk/man/tcl8.4/TclCmd/http.htm +# added TeragramCrawlerSURF http://www.teragram.com/ +# added Test Crawler http://netp.ath.cx/ +# added UnChaos Bot Hybrid Web Search Engine http://www.unchaos.com/ +# added unido-bot http://www.unchina.org/unido/unido/our_projects/3_3.html +# added UniversalFeedParser http://feedparser.org/ (seen from md301000.inktomisearch.com) +# added updated http://www.updated.com/ +# added Vermut http://vermut.aol.com +# added versus crawler from eda.baykan@epfl.ch http://www.epfl.ch/Eindex.html +# added Vespa Crawler (Yahoo Norway?) http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=%5Cbid_t_z_030406_1%5Cb +# added VSE http://www.vivisimo.com/ +# added webcrawl.net http://www.webcrawl.net/ +# added Web Downloader http://www.krasu.ru/soft/chuchelo/ +# added Webdup http://www.webdup.com/en/index.html +# added Wells Search http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_t_z_1484\b +# added WordPress http://wordpress.org/ +# added wume crawler http://wume.cse.lehigh.edu/~xiq204/crawler/ +# added Xenu's Link Sleuth (with ') +# added xirq http://www.xirq.com/ +# added yoogliFetchAgent http://www.yoogli.com/ +# added Z-Add Link Checker http://w3.z-add.co.uk/linkcheck/ +# -- fix - some robots were reported with _ where _ should have been a space. +# changed Xenu Link Sleuth +# changed microsoft[_+\s]url[_+\s]control -> microsoft_url_control +# changed favorites_sweeper -> favorites_sweeper +# -- updates +# updated AskJeeves to Ask +# 2012-06-05 Albrecht Mueller +# added Grabber from SDSC (San Diego Supercomputer Center). 
+# 2013-09-30 Albrecht Mueller +# AWStats probably cannot detect this bot as it identifies itself in +# the referrer field and not in the user agent string. +#92.113.100.35 - - [29/Sep/2013:17:22:46 +0200] "GET /robots.txt HTTP/1.1" 200 516 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0" "-" +#92.113.100.35 - - [29/Sep/2013:17:22:49 +0200] "GET /tghome.htm HTTP/1.1" 200 4445 "http://extrabot.com/help/frytygativyheku.htm" "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0" "-" +#92.113.100.35 - - [29/Sep/2013:17:22:51 +0200] "GET / HTTP/1.1" 200 5467 "http://extrabot.com/help/frytygativyheku.htm" "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0" "-" + +# to do MS Search 4.0 Robot + +#package AWSROB; + + +# Robots list was found at http://www.robotstxt.org/wc/active/all.txt +# Other robots can be found at http://www.jafsoft.com/searchengines/webbots.html +# Rem: To avoid bad detection, some robot's ids were removed from this list: +# - Robots with ID of 3 letters only +# - Robots called 'webs' and 'tcl' +# Rem: directhit changed into direct_hit (its real id) +# Rem: calif changed into calif[^r] to avoid confusion between Tiscalifreenet browser +# Rem: fish changed into [^a]fish to avoid confusion between Madsafish browser +# Rem: roadrunner changed into road_runner +# Rem: lycos changed to lycos_ to avoid confusion with lycos-online browser +# Rem: voyager changed into ^voyager\/ to avoid to exclude voyager and amigavoyager browser + +# RobotsSearchIDOrder +# It contains all matching criteria to search for in log fields. This list is +# used to know in which order to search Robot IDs. +# Most frequent ones are in list1, used when LevelForRobotsDetection is 1 or more +# Minor robots are in list2, used when LevelForRobotsDetection is 2 or more +# Note: Robots IDs are in lower case, '_', ' ' and '+' are changed into '[_+\s]' and are quoted. 
+#------------------------------------------------------- +@RobotsSearchIDOrder_list1 = ( +# Common robots (In robot file) +'appie', +'architext', +'bingpreview', +'bjaaland', +'contentmatch', +'ferret', +'googlebot\-image', +'googlebot', +'google\-sitemaps', +'google[_+\s]web[_+\s]preview', +'grabber', +'gulliver', +'virus[_+\s]detector', # Must be before harvest +'harvest', +'htdig', +'jeeves', +'linkwalker', +'lilina', +'lycos[_+\s]', +'moget', +'muscatferret', +'myweb', +'nomad', +'scooter', +'slurp', +'^voyager\/', +'weblayers', +# Common robots (Not in robot file) +'antibot', +'bruinbot', +'digout4u', +'echo!', +'fast\-webcrawler', +'ia_archiver\-web\.archive\.org', # Must be before ia_archiver to avoid confusion with alexa +'ia_archiver', +'jennybot', +'mercator', +'netcraft', +'msnbot\-media', +'msnbot-udiscovery', +'msnbot', +'petersnews', +'relevantnoise\.com', +'unlost_web_crawler', +'voila', +'webbase', +'webcollage', +'cfetch', +'zyborg', # Must be before wisenut +'wisenutbot' +); +@RobotsSearchIDOrder_list2 = ( +# Less common robots (In robot file) +'007ac9', +'[^a]fish', +'abcdatos', +'abonti\.com', +'acme\.spider', +'ahoythehomepagefinder', +'ahrefsbot', +'alkaline', +'anthill', +'arachnophilia', +'arale', +'araneo', +'aretha', +'ariadne', +'powermarks', +'arks', +'aspider', +'atn\.txt', +'atomz', +'auresys', +'backrub', +'bbot', +'bigbrother', +'blackwidow', +'blindekuh', +'bloodhound', +'borg\-bot', +'brightnet', +'bspider', +'cactvschemistryspider', +'calif[^r]', +'cassandra', +'cgireader', +'checkbot', +'christcrawler', +'churl', +'cienciaficcion', +'cms\scrawler', +'collective', +'combine', +'conceptbot', +'coolbot', +'core', +'cosmos', +'crazywebcrawler', +'cruiser', +'cusco', +'cyberspyder', +'desertrealm', +'deweb', +'dienstspider', +'digger', +'diibot', +'direct_hit', +'dnabot', +'domainappender', +'download_express', +'dragonbot', +'dwcp', +'e\-collector', +'ebiness', +'elfinbot', +'emacs', +'emcspider', +'esther', +'evliyacelebi', 
+'fastcrawler', +'feedcrawl', +'fdse', +'felix', +'fetchrover', +'fido', +'finnish', +'fireball', +'fouineur', +'francoroute', +'freecrawl', +'funnelweb', +'gama', +'gazz', +'gcreep', +'getbot', +'geturl', +'golem', +'gougou', +'grapnel', +'griffon', +'gromit', +'gulperbot', +'hambot', +'havindex', +'hometown', +'htmlgobble', +'hyperdecontextualizer', +'iajabot', +'iaskspider', +'hl_ftien_spider', +'sogou', +'icjobs\.de', +'iconoclast', +'ilse', +'imagelock', +'incywincy', +'informant', +'infoseek', +'infoseeksidewinder', +'infospider', +'inspectorwww', +'intelliagent', +'irobot', +'iron33', +'israelisearch', +'javabee', +'jbot', +'jcrawler', +'jobo', +'jobot', +'joebot', +'jubii', +'jumpstation', +'kapsi', +'katipo', +'kilroy', +'ko[_+\s]yappo[_+\s]robot', +'kummhttp', +'labelgrabber\.txt', +'larbin', +'legs', +'linkidator', +'linkscan', +'lockon', +'logo_gif', +'macworm', +'magpie', +'marvin', +'mattie', +'mediafox', +'merzscope', +'meshexplorer', +'mindcrawler', +'mnogosearch', +'momspider', +'monster', +'motor', +'muncher', +'mwdsearch', +'ndspider', +'nederland\.zoek', +'netcarta', +'netmechanic', +'netscoop', +'newscan\-online', +'nhse', +'northstar', +'nzexplorer', +'objectssearch', +'occam', +'octopus', +'openfind', +'orb_search', +'packrat', +'pageboy', +'parasite', +'patric', +'pegasus', +'perignator', +'perlcrawler', +'phantom', +'phpdig', +'piltdownman', +'pimptrain', +'pioneer', +'pitkow', +'pjspider', +'plumtreewebaccessor', +'poppi', +'portalb', +'psbot', +'python', +'raven', +'rbse', +'resumerobot', +'rhcs', +'road_runner', +'robbie', +'robi', +'robocrawl', +'robofox', +'robozilla', +'roverbot', +'rules', +'safetynetrobot', +'semalt', #Note: This entry will not work as this crawler identifies itself +# in the referrer string and not in the user agent string +'search\-info', +'search_au', +'searchprocess', +'senrigan', +'sgscout', +'shaggy', +'shaihulud', +'sift', +'simbot', +'sistrix', #Virus/trojan-infection? fr-crawler, ca-crawler? 
See https://www.projecthoneypot.org/ip_37.59.55.128, https://www.projecthoneypot.org/ip_198.27.80.144 +'site\-valet', +'sitetech', +'skymob', +'slcrawler', +'smartspider', +'snooper', +'solbot', +'speedy', +'spider[_+\s]monkey', +'spiderbot', +'spiderline', +'spiderman', +'spiderview', +'spry', +'sqworm', +'ssearcher', +'suke', +'sunrise', +'suntek', +'sven', +'tach_bw', +'tagyu_agent', +'tailrank', +'tarantula', +'tarspider', +'techbot', +'templeton', +'titan', +'titin', +'tkwww', +'tlspider', +'ucsd', +'udmsearch', +'universalfeedparser', +'urlck', +'valkyrie', +'verticrawl', +'victoria', +'visionsearch', +'voidbot', +'vwbot', +'w3index', +'w3m2', +'wallpaper', +'wanderer', +'wapspIRLider', +'webbandit', +'webcatcher', +'webcopy', +'webfetcher', +'webfoot', +'webinator', +'weblinker', +'webmirror', +'webmoose', +'webquest', +'webreader', +'webreaper', +'websnarf', +'webspider', +'webvac', +'webwalk', +'webwalker', +'webwatch', +'whatuseek', +'whowhere', +'wired\-digital', +'wmir', +'wolp', +'wombat', +'wordpress', +'worm', +'woozweb', +'wwwc', +'wz101', +'xenu\slink\ssleuth', +'xget', +# Other robots reported by users +'^finbot', #UA string starts with "finbot", should not match "elfinbot" +'^webindex$', #UA should not match "webindexer" +'1\-more_scanner', +'360spider', +'a6-indexer', +'accoona\-ai\-agent', +'activebookmark', +'adamm_bot', +'adsbot-google', +'advbot', +'affectv\.co\.uk', +'almaden', +'aipbot', +'aleadsoftbot', +'alpha_search_agent', +'allrati', +'aport', +'applebot', +'archive\-de\.com', +'archive\.org_bot', +'argus', # Must be before nutch +'arianna\.libero\.it', +'aspseek', +'asterias', +'awbot', +'backlinktest\.com', +'baiduspider', +'becomebot', +'bender', +'betabot', +'biglotron', +'bittorrent_bot', +'biz360[_+\s]spider', +'blexbot', +'blogbridge[_+\s]service', +'bloglines', +'blogpulse', +'blogsearch', +'blogshares', +'blogslive', +'blogssay', +'bncf\.firenze\.sbn\.it\/raccolta\.txt', +'bobby', +'boitho\.com\-dc', +'bookmark\-manager', 
+'boris', +'bubing', +'bumblebee', +'candlelight[_+\s]favorites[_+\s]inspector', +'careerbot', +'cbn00glebot', +'ccbot', +'cerberian_drtrs', +'cfnetwork', +'cipinetbot', +'checkweb_link_validator', +'cliqzbot', +'commons\-httpclient', +'computer_and_automation_research_institute_crawler', +'converamultimediacrawler', +'converacrawler', +'copubbot', +'cscrawler', +'cse_html_validator_lite_online', +'cuasarbot', +'cursor', +'custo', +'datafountains\/dmoz_downloader', +'dataprovider\.com', +'daumoa', +'daviesbot', +'daypopbot', +'deepindex', +'deusu', +'dipsie\.bot', +'dnsgroup', +'doccheckbot', +'domainchecker', +'domainsdb\.net', +'dotbot', +'duckduckgo-favicons-bot', +'dulance', +'dumbot', +'dumm\.de\-bot', +'earthcom\.info', +'easydl', +'eccp', +'edgeio\-retriever', +'ernst[:blank:]2\.0', +'ets_v', +'exactseek', +'extreme[_+\s]picture[_+\s]finder', +'eventax', +'everbeecrawler', +'everest\-vulcan', +'ezresult', +'enteprise', +'facebook', +'facebot', +'fast_enterprise_crawler.*crawleradmin\.t\-info@telekom\.de', +'fast_enterprise_crawler.*t\-info_bi_cluster_crawleradmin\.t\-info@telekom\.de', +'finderlein[_+\s]research[_+\s]crawler', +'matrix_s\.p\.a\._\-_fast_enterprise_crawler', # must come before fast enterprise crawler +'fast_enterprise_crawler', +'fast\-search\-engine', +'fastbot', +'favicon', +'favorg', +'favorites_sweeper', +'feedburner', +'feedfetcher\-google', +'feedflow', +'feedster', +'feedsky', +'feedvalidator', +'fetchbot', +'filmkamerabot', +'filterdb\.iss\.net', +'findlinks', +'findexa_crawler', +'firmilybot', +'foaf-search\.net', +'fooky\.com\/ScorpionBot', +'g2crawler', +'gaisbot', +'geniebot', +'genieo', +'gigablastopensource', +'gigabot', +'girafabot', +'global_fetch', +'gnodspider', +'goforit\.com', +'goforitbot', +'gonzo', +'grapeshot', +'grub', +'gpu_p2p_crawler', +'henrythemiragorobot', +'heritrix', +'holmes', +'hoowwwer', +'hpprint', +'htmlparser', +'html[_+\s]link[_+\s]validator', +'httrack', +'hundesuche\.com\-bot', +'i-bot', +'icarus6j', 
+'ichiro', +'idmarch', +'iltrovatore\-setaccio', +'implisensebot', +'infobot', +'infociousbot', +'infohelfer', +'infomine', +'insurancobot', +'integromedb\.org', +'internet[_+\s]ninja', +'internetarchive', +'internetseer', +'internetsupervision', +'ips\-agent', +'irlbot', +'isearch2006', +'istellabot', +'iupui_research_bot', +'izsearch', +'james\sbot', +'jobboerse', #AWStats seems not to find this one despite the fact that "JobboerseBot" and "jobboerse.com" appear in the UA-string, maybe some previous entry matches +'jrtwine[_+\s]software[_+\s]check[_+\s]favorites[_+\s]utility', +'justview', +'kalambot', +'kamano\.de_newsfeedverzeichnis', +'kazoombot', +'kevin', +'keyoshid', # Must come before Y!J +'kinjabot', +'kinja\-imagebot', +'knowitall', +'knowledge\.com', +'kouaa_krawler', +'krugle', +'ksibot', +'kurzor', +'lanshanbot', +'letscrawl\.com', +'libcrawl', +'linkbot', +'linkdex\.com', +'link_valet_online', +'metager\-linkchecker', # Must be before linkchecker +'linkchecker', +'linkstats\sbot', +'lipperhey', +'livejournal\.com', +'lmspider', +'loadtimebot', +'lssrocketcrawler', +'ltbot', +'ltx71', +'lwp\-request', +'lwp\-trivial', +'madaali\.de', +'magpierss', +'mail\.ru', +'mapoftheinternet\.com', +'meanpathbot', +'mediabot', +'mediapartners\-google', +'megaindex', +'megite', +'memorybot', +'metager2-verification-bot', +'metajobbot', #Does not show up in the results of Sep. 2015 despite the fact that the corresponing log file has about 40 entries containing "MetaJobBot" in the UA string - strange. 
+'metaspinner', +'miadev', +'microsoft\sbits', +'microsoft.*discovery', # = 'microsoft (?:office (?:protocol|existence)|data access internet publishing provider protocol) discovery', +'microsoft[_+\s]url[_+\s]control', +'mindupbot', +'mini\-reptile', +'minirank', +'missigua_locator', +'misterbot', +'miva', +'mizzu_labs', +'mj12bot', +'mojeekbot', +'msiecrawler', +'ms[_+\s]search[_+\s]6\.0[_+\s]robot', +'ms_search_4\.0_robot', +'msrabot', +'msrbot', +'mt::telegraph::agent', +'mydoyouhike', +'nagios', +'nasa_search', +'netestate\sne\scrawler', +'netluchs', +'netsprint', +'newsgatoronline', +'nicebot', +'nimblecrawler', +'noxtrumbot', +'npbot', +'loocalcrawler/nutch', +'nutchcvs', +'nutchosu\-vlib', +'nutch', # Must come after other nutch versions +'ocelli', +'octora_beta_bot', +'omniexplorer[_+\s]bot', +'onet\.pl[_+\s]sa', +'onfolio', +'opentaggerbot', +'openwebspider', +'optimizer', +'oracle_ultra_search', +'orangebot', +'orbiter', +'yodaobot', +'qihoobot', +'qwantify', +'passwordmaker\.org', +'pear_http_request_class', +'peerbot', +'perman', +'php[_+\s]version[_+\s]tracker', +'phpcrawl', +'picmole', +'pictureofinternet', +'ping\.blo\.gs', +'plinki', +'pluckfeedcrawler', +'plukkie', +'pogodak', +'pompos', +'popdexter', +'port_huron_labs', +'postfavorites', +'projectwf\-java\-test\-crawler', +'proodlebot', +'publiclibraryarchive', +'pyquery', +'rambler', +'redalert', +'riddler', +'rogerbot', +'rojo', +'rssimagesbot', +'ruffle', +'rufusbot', +'safeads\.xyz', +'safesearch', +'sandcrawler', +'savetheworldheritage', +'sbider', +'schizozilla', +'scumbot', +'searchguild[_+\s]dmoz[_+\s]experiment', +'searchmetricsbot', +'seekbot', +'semrushbot', +'sensis_web_crawler', +'seodiver', +'seokicks\.de', +'seoscanners', +'seznambot', +'shim\-crawler', +'shoutcast', +'sitedomain-bot', +'siteexplorer\.info', +'skimbot', +'slysearch', +'smtbot', +'snap\.com_beta_crawler', +'sohu\-search', +'sohu', # "sohu agent" +'snappy', +'spbot', +'sphere_scout', +'spiderlytics', +'spip', 
+'sproose_crawler', +'ssearch_bot', +'steeler', +'steroid__download', +'stq_bot', +'suchfin\-bot', +'superbot', +'surveybot', +'susie', +'syndic8', +'syndicapi', +'synoobot', +'tcl_http_client_package', +'technoratibot', +'teragramcrawlersurf', +'test_crawler', +'testbot', +'thumbsniper', +'t\-h\-u\-n\-d\-e\-r\-s\-t\-o\-n\-e', +'topicblogs', +'turnitinbot', +'turtlescanner', # Must be before turtle +'turtle', +'tutorgigbot', +'twiceler', +'ubicrawler', +'ultraseek', +'unchaos_bot_hybrid_web_search_engine', +'unido\-bot', +'unisterbot', +'updated', +'ustc\-semantic\-group', +'vagabondo\-wap', +'vagabondo', +'vebidoobot', +'vermut', +'versus_crawler_from_eda\.baykan@epfl\.ch', +'vespa_crawler', +'voltron', +'vortex', +'vse\/', +'w3c\-checklink', +'w3c[_+\s]css[_+\s]validator[_+\s]jfouffa', +'w3c_validator', +'watchmouse', +'wavefire', +'waybackarchive\.org', +'wbsearchbot', +'webclipping\.com', +'webcompass', +'webcrawl\.net', +'web_downloader', +'webdup', +'webfilter', +'webindexer', +'webminer', +'website[_+\s]monitoring[_+\s]bot', +'webvulncrawl', +'wells_search', +'wer-liefert-was', +'wesee:search', +'wevikabot', +'wonderer', +'wotbox', +'wume_crawler', +'wwweasel', +'xenu\'s_link_sleuth', +'xenu_link_sleuth', +'xirq', +'xovibot', +'y!j', # Must come after keyoshid Y!J +'yacy', +'yahoo\-blogs', +'yahoo\-verticalcrawler', +'yahoofeedseeker', +'yahooseeker\-testing', +'yahooseeker', +'yahoo\-mmcrawler', +'yahoo!_mindset', +'yandex', +'flexum', +'yanga', +'yet-another-spider', +'yisouspider', +'yooglifetchagent', +'z\-add_link_checker', +'zealbot', +'zhuaxia', +'zspider', +'zeus', +'ng\/1\.', # put at end to avoid false positive +'ng\/2\.', # put at end to avoid false positive +'exabot', # put at end to avoid false positive +# Additional bots found by Sussex. +'^[1-3]$', # Hiding bots. Doesn't appear to be a valid user agent. 
+'alltop', +'applesyndication', +'asynchttpclient', +'bingbot', +'blogged_crawl', +'bloglovin', +'butterfly', +'buzztracker', +'carpathia', +'catbot', +'chattertrap', +'check_http', #(nagios) a monitoring tool +'coldfusion', +'covario', +'daylifefeedfetcher', +'discobot', +'dlvr\.it', +'dreamwidth', +'drupal', +'ezoom', +'feedmyinbox', +'feedroll\.com', +'feedzira', +'fever\/', +'freenews', +'geohasher', +'hanrss', +'inagist', +'jacobin\sclub', +'jakarta', +'js\-kit', +'largesmall\scrawler', +'linkedinbot', +'longurl', +'metauri', +'microsoft\-webdav\-miniredir', +'^motorola$', +'movabletype', +# These appear to be bots trying to hide. All of the usual architecture data is missing. +'^mozilla\/3\.0\s\(compatible$', +'^mozilla\/4\.0$', +'^mozilla\/4\.0\s\(compatible;\)$', +'^mozilla\/5\.0$', +'^mozilla\/5\.0\s\(compatible;$', +'^mozilla\/5\.0\s\(en\-us\)$', +'^mozilla\/5\.0\sfirefox\/3\.0\.5$', +'^msie', +# End of hiding bots. +'netnewswire', +'\snetseer\s', +'netvibes', +'newrelicpinger', +'newsfox', +'nextgensearchbot', +'ning', +'pingdom', +'pita', +'postpost', +'postrank', +'printfulbot', +'protopage', +'proximic', +'quipply', +'r6\_', +'ratingburner', +'regator', +'rome\sclient', +'rpt\-httpclient', +'rssgraffiti', +'sage\+\+', +'scoutjet', +'simplepie', +'sitebot', +'summify\.com', +'superfeedr', +'synthesio', +'teoma', +'topblogsinfo', +'topix\.net', +'trapit', +'trileet', +'tweetedtimes', +'twisted\spagegetter', +'twitterbot', +'twitterfeed', +'unwindfetchor', +'wazzup', +'windows\-rss\-platform', +'wiumi', +'xydo', +'yahoo!\sslurp', +'yahoo\spipes', +'yahoo\-newscrawler', +'yahoocachesystem', +'yahooexternalcache', +'yahoo!\ssearchmonkey', +'yahooysmcm', +'yammer', +# 'yandexbot', #already covered by 'yandex' +'yeti', +'yie8', +'youdao', +'yourls', +'zemanta', +'zend_http_client', +'zumbot', +# Other id that are 99% of robots +'wget', +'libwww', +'^java\/[0-9]' # put at end to avoid false positive +); +@RobotsSearchIDOrder_listgen = ( +# Generic robot patterns: broad catch-all words and delimiter-anchored forms of bot, as opposed to the names of individual robots. +# NOTE(review): presumably these are tried only after the more specific ID lists - confirm against the code that consumes this list. 
+'robot', +'checker', +'crawl', +'discovery', +'hunter', +'scanner', +'spider', +'sucker', +'bot[\s_+:,\.\;\/\\\-]', +# The preceding pattern (bot followed by a delimiter) identifies +#"Mozilla/5.0 (Linux; U; Android 4.2.2; de-de; CUBOT P9 Build/JDQ39) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30" +#as a bot, presumably because matching is case-insensitive (confirm against the caller). There is an Android mobile phone called "CUBOT P9", so this is probably not a bot. +'[\s_+:,\.\;\/\\\-]bot', +'curl', +'php', +'ruby\/', +'no_user_agent' +); + + + +# RobotsHashIDLib +# List of robots names ('robot id','robot clear text') +#------------------------------------------------------- +%RobotsHashIDLib = ( +# Common robots (In robot file) +'appie','Walhello appie', +'architext','ArchitextSpider', +'bingpreview','Bing Preview bot', +'bjaaland','Bjaaland', +'ferret','Wild Ferret Web Hopper #1, #2, #3', +'contentmatch','Yahoo!China ContentMatch Crawler', +'googlebot\-image','Googlebot-Image', +'googlebot','Googlebot', +'google\-sitemaps', 'Google Sitemaps', +'grabber', 'Grabber (SDSC)', +'google[_+\s]web[_+\s]preview', 'Google Web Preview', +'gulliver','Northern Light Gulliver', +'virus[_+\s]detector','virus_detector', +'harvest','Harvest', +'htdig','ht://Dig', +'jeeves','Ask', +'linkwalker','LinkWalker', +'lilina','Lilina', +'lycos[_+\s]','Lycos', +'moget','moget', +'muscatferret','Muscat Ferret', +'myweb','Internet Shinchakubin', +'nomad','Nomad', +'scooter','Scooter', +'slurp','Yahoo Slurp', +'^voyager\/','Voyager', +'weblayers','Weblayers', +# Common robots (Not in robot file) +'antibot','Antibot', +'bruinbot','The web archive', +'digout4u','Digout4u', +'echo!','EchO!', +'fast\-webcrawler','Fast-Webcrawler', +'ia_archiver\-web\.archive\.org','The web archive (IA Archiver)', +'ia_archiver','Alexa (IA Archiver)', +'jennybot','JennyBot', +'mercator','Mercator', +'msnbot\-media','MSNBot-media', +'msnbot-udiscovery', 'msnbot-UDiscovery Note: AWStats counts most of its traffic as user traffic', +'msnbot','MSNBot', +'netcraft','Netcraft', +'petersnews','Petersnews', 
+'unlost_web_crawler','Unlost Web Crawler', +'voila','Voila', +'webbase', 'WebBase', +'zyborg','ZyBorg', +'wisenutbot','WISENutbot', +'webcollage','WebCollage', +'cfetch','Cfetch', +# Less common robots (In robot file) +'007ac9', '007ac9 Crawler, seems to belong to SISTRIX', +'[^a]fish','Fish search', +'abcdatos','ABCdatos BotLink', +'abonti\.com','Abonti WebSearch', +'acme\.spider','Acme.Spider', +'ahoythehomepagefinder','Ahoy! The Homepage Finder', +'ahrefsbot', 'AhrefsBot', +'alkaline','Alkaline', +'anthill','Anthill', +'arachnophilia','Arachnophilia', +'arale','Arale', +'araneo','Araneo', +'aretha','Aretha', +'ariadne','ARIADNE', +'powermarks','Powermarks', # must come before Arks; seen used by referrer spam +'arks','arks', +'aspider','ASpider (Associative Spider)', +'atn\.txt','ATN Worldwide', +'atomz','Atomz.com Search Robot', +'auresys','AURESYS', +'backrub','BackRub', +'bbot','BBot', +'bigbrother','Big Brother', +'blackwidow','BlackWidow', +'blindekuh','Die Blinde Kuh', +'bloodhound','Bloodhound', +'borg\-bot','Borg-Bot', +'brightnet','bright.net caching robot', +'bspider','BSpider', +'cactvschemistryspider','CACTVS Chemistry Spider', +'calif[^r]','Calif', +'cassandra','Cassandra', +'cgireader','Digimarc Marcspider/CGI', +'checkbot','Checkbot', +'christcrawler','ChristCrawler.com', +'churl','churl', +'cienciaficcion','cIeNcIaFiCcIoN.nEt', +'cms\scrawler', 'CMS Crawler', +'collective','Collective', +'combine','Combine System', +'conceptbot','Conceptbot', +'coolbot','CoolBot', +'core','Web Core / Roots', +'cosmos','XYLEME Robot', +'crazywebcrawler', 'CrazyWeb Crawler', +'cruiser','Internet Cruiser Robot', +'cusco','Cusco', +'cyberspyder','CyberSpyder Link Test', +'desertrealm','Desert Realm Spider', +'deweb','DeWeb(c) Katalog/Index', +'dienstspider','DienstSpider', +'digger','Digger', +'diibot','Digital Integrity Robot', +'direct_hit','Direct Hit Grabber', +'dnabot','DNAbot', +'domainappender', 'DomainAppender', +'download_express','DownLoad Express', 
+'dragonbot','DragonBot', +'dwcp','DWCP (Dridus\' Web Cataloging Project)', +'e\-collector','e-collector', +'ebiness','EbiNess', +'elfinbot','ELFINBOT', +'emacs','Emacs-w3 Search Engine', +'emcspider','ananzi', +'esther','Esther', +'evliyacelebi','Evliya Celebi', +'fastcrawler','FastCrawler', +'feedcrawl','FeedCrawl by feed@aobo.com', +'fdse','Fluid Dynamics Search Engine robot', +'felix','Felix IDE', +'fetchrover','FetchRover', +'fido','fido', +'finnish','Finnish', +'fireball','KIT-Fireball', +'fouineur','Fouineur', +'francoroute','Robot Francoroute', +'freecrawl','Freecrawl', +'funnelweb','FunnelWeb', +'gama','gammaSpider, FocusedCrawler', +'gazz','gazz', +'gcreep','GCreep', +'getbot','GetBot', +'geturl','GetURL', +'golem','Golem', +'gougou','GouGou', +'grapnel','Grapnel/0.01 Experiment', +'griffon','Griffon', +'gromit','Gromit', +'gulperbot','Gulper Bot', +'hambot','HamBot', +'havindex','havIndex', +'hometown','Hometown Spider Pro', +'htmlgobble','HTMLgobble', +'hyperdecontextualizer','Hyper-Decontextualizer', +'iajabot','iajaBot', +'iaskspider','Sina Iask Spider', +'hl_ftien_spider','Hylanda', +'sogou','Sogou Spider', +'icjobs\.de', 'iCjobs Spider Note: Most traffic counts as user traffic', +#20130805 The user agent string of the icjobs-spider contained the +#identifying string only when it accessed the robots.txt file. +#When it accessed the actual content it did not identify itself as +#a spider. Thus traffic of this spider was counted as user traffic. +#The behavious seems to have changed now - the spider identifies itself +#when it accesses content pages. +#20141401 Behavior as before: Does identify itself when it accesses +# robots.txt and the root page. The following traffic does not contain +# the identification string and is therefore counted as user traffic. 
+'iconoclast','Popular Iconoclast', +'ilse','Ingrid', +'imagelock','Imagelock', +'incywincy','IncyWincy', +'informant','Informant', +'infoseek','InfoSeek Robot 1.0', +'infoseeksidewinder','Infoseek Sidewinder', +'infospider','InfoSpiders', +'inspectorwww','Inspector Web', +'intelliagent','IntelliAgent', +'ips\-agent', 'ips-agent Verisign(?) - no reliable information found.', +'irobot','I, Robot', +'iron33','Iron33', +'israelisearch','Israeli-search', +'javabee','JavaBee', +'jbot','JBot Java Web Robot', +'jcrawler','JCrawler', +'jobo','JoBo Java Web Robot', +'jobot','Jobot', +'joebot','JoeBot', +'jubii','The Jubii Indexing Robot', +'jumpstation','JumpStation', +'kapsi','image.kapsi.net', +'katipo','Katipo', +'kilroy','Kilroy', +'ko[_+\s]yappo[_+\s]robot','KO_Yappo_Robot', +'kummhttp','KummHttp', +'labelgrabber\.txt','LabelGrabber', +'larbin','larbin', +'legs','legs', +'linkidator','Link Validator', +'linkscan','LinkScan', +'lockon','Lockon', +'logo_gif','logo.gif Crawler', +'macworm','Mac WWWWorm', +'lmspider','lmspider', +'lwp\-request','lwp-request', +'lwp\-trivial','lwp-trivial', +'magpie','MagpieRSS', +'marvin','marvin/infoseek', +'mattie','Mattie', +'mediafox','MediaFox', +'merzscope','MerzScope', +'meshexplorer','NEC-MeshExplorer', +'mindcrawler','MindCrawler', +'mnogosearch','mnoGoSearch search engine software', +'momspider','MOMspider', +'monster','Monster', +'motor','Motor', +'muncher','Muncher', +'mwdsearch','Mwd.Search', +'ndspider','NDSpider', +'nederland\.zoek','Nederland.zoek', +'netcarta','NetCarta WebMap Engine', +'netmechanic','NetMechanic', +'netscoop','NetScoop', +'newscan\-online','newscan-online', +'nhse','NHSE Web Forager', +'northstar','The NorthStar Robot', +'nzexplorer','nzexplorer', +'objectssearch','ObjectsSearch', +'occam','Occam', +'octopus','HKU WWW Octopus', +'openfind','Openfind data gatherer', +'orb_search','Orb Search', +'packrat','Pack Rat', +'pageboy','PageBoy', +'parasite','ParaSite', +'patric','Patric', +'pegasus','pegasus', 
+'perignator','The Peregrinator', +'perlcrawler','PerlCrawler 1.0', +'phantom','Phantom', +'phpdig','PhpDig', +'piltdownman','PiltdownMan', +'pimptrain','Pimptrain.com\'s robot', +'pioneer','Pioneer', +'pitkow','html_analyzer', +'pjspider','Portal Juice Spider', +'plumtreewebaccessor','PlumtreeWebAccessor', +'poppi','Poppi', +'portalb','PortalB Spider', +'psbot','psbot', +'python','Python-urllib', +'raven','Raven Search', +'rbse','RBSE Spider', +'resumerobot','Resume Robot', +'rhcs','RoadHouse Crawling System', +'road_runner','Road Runner: The ImageScape Robot', +'robbie','Robbie the Robot', +'robi','ComputingSite Robi/1.0', +'robocrawl','RoboCrawl Spider', +'robofox','RoboFox', +'robozilla','Robozilla', +'roverbot','Roverbot', +'rules','RuLeS', +'safetynetrobot','SafetyNet Robot', +'semalt', 'seamalt.com', +'search\-info','Sleek', +'search_au','Search.Aus-AU.COM', +'searchprocess','SearchProcess', +'senrigan','Senrigan', +'sgscout','SG-Scout', +'shaggy','ShagSeeker', +'shaihulud','Shai\'Hulud', +'sift','Sift', +'simbot','Simmany Robot Ver1.0', +'sistrix', 'SISTRIX Crawler', +'site\-valet','Site Valet', +'sitetech','SiteTech-Rover', +'skymob','Skymob.com', +'slcrawler','SLCrawler', +'smartspider','Smart Spider', +'snooper','Snooper', +'solbot','Solbot', +'speedy','Speedy Spider', +'spider[_+\s]monkey','Spider monkey', +'spiderbot','SpiderBot', +'spiderline','Spiderline Crawler', +'spiderlytics', 'Spiderlytics: No homepage, e-mail only: spider (at) spiderlytics.com', +'spiderman','Spiderman', +'spiderview','SpiderView(tm)', +'spry','Spry Wizard Robot', +'ssearcher','Site Searcher', +'sqworm','Sqworm', +'suke','Suke', +'sunrise','Sunrise', +'suntek','suntek search engine', +'sven','Sven', +'tach_bw','TACH Black Widow', +'tagyu_agent','Tagyu Agent', +'tarantula','Tarantula', +'tarspider','tarspider', +'tailrank','TailRank', +'techbot','TechBOT', +'templeton','Templeton', +'titan','TITAN', +'titin','TitIn', +'tkwww','The TkWWW Robot', +'tlspider','TLSpider', 
+'ucsd','UCSD Crawl', +'udmsearch','UdmSearch', +'universalfeedparser','UniversalFeedParser', +'urlck','URL Check', +'valkyrie','Valkyrie', +'verticrawl','Verticrawl', +'victoria','Victoria', +'visionsearch','vision-search', +'voidbot','void-bot', +'vwbot','VWbot', +'w3index','The NWI Robot', +'w3m2','W3M2', +'wallpaper','WallPaper (alias crawlpaper)', +'wanderer','the World Wide Web Wanderer', +'wapspider','w@pSpider by wap4.com', +'webbandit','WebBandit Web Spider', +'webcatcher','WebCatcher', +'webcopy','WebCopy', +'webfetcher','webfetcher', +'webfoot','The Webfoot Robot', +'webinator','Webinator', +'weblinker','WebLinker', +'webmirror','WebMirror', +'webmoose','The Web Moose', +'webquest','WebQuest', +'webreader','Digimarc MarcSpider', +'webreaper','WebReaper', +'websnarf','Websnarf', +'webspider','WebSpider', +'webvac','WebVac', +'webwalk','webwalk', +'webwalker','WebWalker', +'webwatch','WebWatch', +'whatuseek','whatUseek Winona', +'whowhere','WhoWhere Robot', +'wired\-digital','Wired Digital', +'wmir','w3mir', +'wolp','WebStolperer', +'wombat','The Web Wombat', +'wordpress','WordPress', +'worm','The World Wide Web Worm', +'woozweb','Woozweb Monitoring', +'wwwc','WWWC Ver 0.2.5', +'wz101','WebZinger', +'xenu\slink\ssleuth', 'Xenu'. "'" . 
's Link Sleuth (TM), see Wikipedia', +'xget','XGET', +# Other robots reported by users +'^finbot', 'finbot', +'^webindex$', 'WebIndex', +'1\-more_scanner','1-More Scanner', +'360spider','360spider', +'a6-indexer', 'A6-Indexer', +'accoona\-ai\-agent','Accoona-AI-Agent', +'activebookmark','ActiveBookmark', +'adamm_bot','AdamM Bot', +'adsbot-google', 'AdsBot-Google', +'advbot', 'AdvBot', +'affectv\.co\.uk', 'affectv.co.uk', +'almaden','IBM Almaden Research Center WebFountain™', +'aipbot','aipbot', +'aleadsoftbot','ALeadSoftbot', +'alpha_search_agent','Alpha Search Agent', +'allrati','Allrati', +'aport', 'Aport', +'applebot', 'Applebot', +'archive\-de\.com', 'Archive-de.com', +'archive\.org_bot','archive.org bot', +'argus','Argus', +'arianna\.libero\.it','arianna.libero.it', +'aspseek','ASPseek', +'asterias', 'Asterias', +'awbot', 'AWBot', +'backlinktest\.com', 'BacklinkCrawler', +'baiduspider','BaiDuSpider', +'becomebot', 'BecomeBot', +'bender','bender focused_crawler', +'betabot','BetaBot', +'biglotron','Biglotron', +'bittorrent_bot','BitTorrent Bot', +'biz360[_+\s]spider','Biz360 spider', +'blexbot', 'BLEXBot, seems to belong to the WebMeUp backlink tool', +'blogbridge[_+\s]service','BlogBridge Service', +'bloglines','Bloglines', +'blogpulse','BlogPulse ISSpider intelliseek.com', +'blogsearch','BlogSearch', +'blogshares','Blogshares Spiders', +'blogslive','Blogslive', +'blogssay','BlogsSay :: RSS Search Crawler', +'bncf\.firenze\.sbn\.it\/raccolta\.txt','Biblioteca Nazionale Centrale di Firenze', +'bobby', 'Bobby', +'boitho\.com\-dc','boitho.com-dc', +'bookmark\-manager','Bookmark-Manager', +'boris', 'Boris', +'bubing', 'BUbiNG', +'bumblebee', 'Bumblebee (relevare.com)', +'candlelight[_+\s]favorites[_+\s]inspector','Candlelight_Favorites_Inspector', +'careerbot', 'CareerBot', +'cbn00glebot','cbn00glebot', +'ccbot', 'Common Crawl', +'cerberian_drtrs','Cerberian Drtrs', +'cfnetwork','CFNetwork', +'cipinetbot','CipinetBot', +'checkweb_link_validator','CheckWeb link 
validator', +'cliqzbot', 'Cliqzbot', +'commons\-httpclient','Jakarta commons-httpclient', +'computer_and_automation_research_institute_crawler','Computer and Automation Research Institute Crawler', +'converamultimediacrawler','ConveraMultiMediaCrawler', +'converacrawler','ConveraCrawler', +'copubbot', 'CoPubbot', +'cscrawler','CsCrawler', +'cse_html_validator_lite_online','CSE HTML Validator Lite Online','cuasarbot','Cuasarbot', +'cursor','Cursor', +'custo','Custo', +'datafountains\/dmoz_downloader','DataFountains/DMOZ Downloader', +'dataprovider\.com', 'Dataprovider Site Explorer', +'daumoa', 'Daum', +'daviesbot', 'DaviesBot', +'daypopbot', 'DayPop', +'deepindex','Deepindex', +'deusu', 'DeuSu', +'dipsie\.bot','Dipsie', +'dnsgroup','DNSGroup', +'doccheckbot', 'doccheckbot/1.0, known to Project Honey Pot', +'domainchecker','DomainChecker', +'domainsdb\.net','DomainsDB.net', +'dotbot', 'DotBot, Open Site Explorer', +'duckduckgo-favicons-bot', 'DuckDuckGo-Favicons-Bot', +'dulance','Dulance', +'dumbot','Dumbot', +'dumm\.de\-bot','dumm.de-Bot', +'earthcom\.info','EARTHCOM.info', +'easydl','EasyDL', +'eccp', 'Eniro Sverige, email: search (at) eniro.com', +'edgeio\-retriever','edgeio-retriever', +'ernst[:blank:]2\.0', 'Ernst 2.0 (does not provide any further information)', +'ets_v','ETS Enterprise Translation Server', +'exactseek','ExactSeek Crawler', +'extreme[_+\s]picture[_+\s]finder','Extreme_Picture_Finder', +'eventax','eventax', +'everbeecrawler','EverbeeCrawler', +'everest\-vulcan','Everest-Vulcan', +'ezresult', 'Ezresult', +'enteprise','Fast Enteprise Crawler', +'facebook','FaceBook bot', +'facebot', 'Facebot (Facebook bot?)', +'fast\-search\-engine','Fast-Search-Engine (not fastsearch.com)', +'fast_enterprise_crawler','FAST Enterprise Crawler', +'fast_enterprise_crawler.*scrawleradmin\.t\-info@telekom\.de','FAST Enterprise Crawler * crawleradmin.t-info@telekom.de', +'finderlein[_+\s]research[_+\s]crawler', 'Finderlein Research Crawler 1.0 (no contact information 
given)', +'matrix_s\.p\.a\._\-_fast_enterprise_crawler','Matrix S.p.A. - FAST Enterprise Crawler', +'fast_enterprise_crawler.*t\-info_bi_cluster_crawleradmin\.t\-info@telekom\.de','FAST Enterprise Crawler * T-Info_BI_cluster crawleradmin.t-info@telekom.de', +'fastbot', 'fastbot', +'favicon','FavIconizer', +'favorg','FavOrg', +'favorites_sweeper','Favorites Sweeper', +'feedburner', 'Feedburner', +'feedfetcher\-google','Feedfetcher-Google', +'feedflow','FeedFlow', +'feedster','Feedster', +'feedsky','FeedSky', +'feedvalidator','FeedValidator', +'fetchbot', 'Fetchbot', +'filmkamerabot','FilmkameraBot', +'filterdb\.iss\.net', 'oBot', +'findexa_crawler','Findexa Crawler', +'firmilybot', 'Firmily Bot Home page (Website was hacked on Oct. 19, 2013)', +'findlinks','Findlinks', +'foaf-search\.net', 'Friend of a friend (FOAF) search engine', +'fooky\.com\/ScorpionBot','Fooky.com/ScorpionBot/ScoutOut', +'g2crawler','G2Crawler', +'gaisbot','Gaisbot', +'geniebot','Geniebot', +'genieo', 'Genieo', +'gigablastopensource', 'GigablastOpenSource, an Open Source Search Engine(Wiki)', +'gigabot','GigaBot', +'girafabot','Girafabot', +'global_fetch','Global Fetch', +'gnodspider','GNOD Spider', +'goforit\.com','GoForIt.com', +'goforitbot','GOFORITBOT', +'gonzo','suchen.de', +'gpu_p2p_crawler','GPU p2p crawler', +'grapeshot', 'Grapeshot Crawler', +'grub','Grub.org', +'henrythemiragorobot', 'Mirago', +'heritrix','Heritrix', +'holmes', 'Holmes', +'hoowwwer','HooWWWer', +'hpprint','HPPrint', +'htmlparser','HTMLParser', +'html[_+\s]link[_+\s]validator','Html_Link_Validator', +'httrack','HTTrack off-line browser', +'hundesuche\.com\-bot','Hundesuche.com-Bot', +'i-bot','i-bot', +'icarus6j', 'Icarus6j, email address in UA string, no website', +'ichiro','ichiro', +'idmarch', 'IDMARCH', +'iltrovatore\-setaccio','IlTrovatore-Setaccio', +'implisensebot', 'ImplisenseBot', +'infobot','InfoBot', +'infociousbot','InfociousBot', +'infohelfer','Infohelfer', +'infomine','INFOMINE VLCrawler', 
+'insurancobot','InsurancoBot', +'integromedb\.org','IntegromeDB', +'internet[_+\s]ninja','Internet_Ninja ', +'internetarchive','InternetArchive', +'internetseer', 'InternetSeer', +'internetsupervision','InternetSupervision', +'irlbot','IRLbot', +'isearch2006','isearch2006', +'istellabot', 'IstellaBot', +'iupui_research_bot','IUPUI_Research_Bot', +'izsearch', 'iZSearch', +'james\sbot', 'James BOT', +'jobboerse', 'Jobbörse', +'jrtwine[_+\s]software[_+\s]check[_+\s]favorites[_+\s]utility','JRTwine_Software_Check_Favorites_Utility', +'justview', 'JustView', +'kalambot','KalamBot', +'kamano\.de_newsfeedverzeichnis','kamano.de NewsFeedVerzeichnis', +'kazoombot','KazoomBot', +'kevin','Kevin', +'keyoshid','Yahoo! Japan keyoshid robot study', +'kinjabot', 'Kinjabot', +'kinja\-imagebot', 'Kinja Imagebot', +'knowitall','KnowItAll', +'knowledge\.com','Knowledge.com', +'kouaa_krawler','Kouaa Krawler', +'krugle','Krugle', +'ksibot','ksibot', +'kurzor','Kurzor', +'lanshanbot','lanshanbot', +'letscrawl\.com','LetsCrawl.com', +'libcrawl','Crawl libcrawl', +'link_valet_online','Link Valet Online', +'linkbot','LinkBot', +'linkdex\.com', 'Linkdex', +'linkchecker','LinkChecker', +'linkstats\sbot', 'LinkStats Bot', +'lipperhey', 'Lipperhey SEO Service', +'livejournal\.com', 'LiveJournal.com', +'loadtimebot', 'LoadTimeBot', +'lssrocketcrawler', 'LSSRocketCrawler (no contact information)', +'ltbot', 'Language Tools Bot (ltbot)', +'ltx71', 'ltx71', +'madaali\.de', 'www.madaali.de', +'magpierss', 'MagpieRSS', +'mail\.ru', 'Mail.ru bot', +'mapoftheinternet\.com','MapoftheInternet.com', +'meanpathbot', 'Meanpathbot', +'mediabot', 'MediaBot', +'mediapartners\-google','Google AdSense', +# 'Mediapartners-Google (Feb 12, 2015: no additial information in UA String, seems to use GigablastOpenSource', +# Uses UA string "Mediapartners-Google" only, and there were accesses using an UA string "GigablastOpenSource/1.0" from the same IP-Address. 
+# Therefore this is probably not related to Google 4.3.2015 Albrecht Müller +'megaindex', 'MegaIndex Crawler, seems to belong to MegaIndex.ru', +'megite','Megite', +'memorybot', 'Archivethe.net', +'metager2-verification-bot', 'metager2-verification-bot', +'metager\-linkchecker','MetaGer LinkChecker', +'metajobbot', 'MetaJobBot', +'metaspinner','Metaspinner', +'miadev', 'MiaDev spider', +'microsoft\sbits', 'Microsoft Background Intelligent Transfer Service (BITS)?', +'microsoft.*discovery', 'Microsoft Office Protocol Discovery/Microsoft Office Existence Discovery', +'microsoft[_+\s]url[_+\s]control','Microsoft URL Control', +'mindupbot', 'mindUpBot (datenbutler.de)', +'minirank','miniRank', +'mini\-reptile','Mini-reptile', +'missigua_locator','Missigua_Locator', +'misterbot','Misterbot', +'miva','Miva', +'mizzu_labs','Mizzu Labs', +'mj12bot','MJ12bot', +'mojeekbot','MojeekBot', +'msiecrawler','MSIECrawler', +'ms[_+\s]search[_+\s]6\.0[_+\s]robot','MS Search 6.0 Robot (MS SharePoint Portal Server?)', +'ms_search_4\.0_robot','MS SharePoint Portal Server - MS Search 4.0 Robot', +'msrabot','msrabot', +'msrbot','MSRBOT', +'mt::telegraph::agent','MT::Telegraph::Agent', +'mydoyouhike','Mydoyouhike', +'nagios','Nagios', +'nasa_search','NASA Search', +'netestate\sne\scrawler','Website-Datenbank', +'netluchs','Netluchs', +'netsprint','NetSprint', +'newsgatoronline', 'NewsGator Online', +'nicebot','nicebot', +'nimblecrawler','NimbleCrawler', +'noxtrumbot','noxtrumbot', +'npbot','NPBot', +'loocalcrawler/nutch', 'LoocalCrawler/Nutch', +'nutchcvs','NutchCVS', +'nutchosu\-vlib','NutchOSU-VLIB', +'nutch','Nutch', +'ocelli','Ocelli', +'octora_beta_bot','Octora Beta Bot', +'omniexplorer[_+\s]bot','OmniExplorer Bot', +'onet\.pl[_+\s]sa','Onet.pl_SA', +'onfolio','Onfolio', +'opentaggerbot','OpenTaggerBot', +'openwebspider','OpenWebSpider', +'optimizer', 'Optimizer', +'oracle_ultra_search','Oracle Ultra Search', +'orangebot', 'OrangeBot, no website, log entry specifies mail address', # 
support.orangebot@orange.com +'orbiter','Orbiter', +'yodaobot','OutfoxBot/YodaoBot', +'qihoobot','QihooBot', +'qwantify', 'Qwant', +'passwordmaker\.org','passwordmaker.org', +'pear_http_request_class','PEAR HTTP Request class', +'peerbot','PEERbot', +'perman', 'Perman surfer', +'php[_+\s]version[_+\s]tracker','PHP version tracker', +'phpcrawl', 'PHPCrawl', +'picmole', 'Specified address www.picmole.com was not reachable on April 21, 2014', +'pictureofinternet','PictureOfInternet', +'ping\.blo\.gs','ping.blo.gs', +'plinki','plinki', +'pluckfeedcrawler','PluckFeedCrawler', +'plukkie', 'Plukkie', +'pogodak','Pogodak.com', +'pompos','Pompos', +'popdexter','Popdexter', +'port_huron_labs','Port Huron Labs', +'postfavorites','PostFavorites', +'projectwf\-java\-test\-crawler','ProjectWF-java-test-crawler', +'proodlebot','proodleBot', +'publiclibraryarchive', 'publiclibraryarchive.org (related to spiderlytics.com and/or waybackarchive.org?)', +#Observations 2014-06-23 +#Domain publiclibraryarchive.org is parked at GoDaddy.com +#from https://www.projecthoneypot.org/ +#81.30.151.220's User Agent Strings (honeypot classified this ip as an mail server, active about 6 years ago) +#Mozilla/5.0 (compatible; publiclibraryarchive.org/1.0; +crawl@publiclibraryarchive.org) +#176.9.138.27's User Agent Strings +#Mozilla/5.0 (compatible; publiclibraryarchive.org/1.0; +crawl@publiclibraryarchive.org) +#Mozilla/5.0 (compatible; Spiderlytics/1.0; +spider@spiderlytics.com) +#Mozilla/5.0 (compatible; waybackarchive.org/1.0; +spider@waybackarchive.org) +#146.0.32.165's User Agent Strings +#Mozilla/5.0 (compatible; publiclibraryarchive.org/1.0; +crawl@publiclibraryarchive.org) +#Mozilla/5.0 (compatible; savetheworldheritage.org/1.0; +crawl@savetheworldheritage.org) +#Mozilla/5.0 (compatible; seoscanners.net/1; +spider@seoscanners.net) +'pyquery','PyQuery', +'rambler','StackRambler', +'redalert','Red Alert', +'relevantnoise\.com', 'Relevant Noise', +'riddler', 'Riddler', +'rogerbot', 'Rogerbot', 
+'rojo','RoJo aggregator', +'rssimagesbot','rssImagesBot', +'ruffle','ruffle SemanticWeb crawler', +'rufusbot','RufusBot Rufus Web Miner', +'safeads\.xyz', 'SafeAds.xyz', +'safesearch', 'Avira SafeSearch', +'sandcrawler','SandCrawler (Microsoft)', +'savetheworldheritage', 'savetheworldheritage.org (related to spiderlytics.com, waybackarchive.org and/or publiclibraryarchive.org?)', +'sbider','SBIder', +'schizozilla','Schizozilla', +'scumbot','Scumbot', +'searchguild[_+\s]dmoz[_+\s]experiment','SearchGuild_DMOZ_Experiment', +'searchmetricsbot','SearchmetricsBot', +'seekbot','Seekbot', +'semrushbot', 'SemrushBot', +'sensis_web_crawler','Sensis Web Crawler', +'seodiver', 'SEO DIVER', +'seokicks\.de', 'SEOkicks Webcrawler', +'seoscanners', 'seoscanners.net (related to publiclibraryarchive.org and savetheworldheritage.org?)', +'seznambot','SeznamBot', +'shim\-crawler','Shim-Crawler', +'shoutcast','Shoutcast Directory Service', +'sitedomain-bot', 'Sitedomain.de', +'siteexplorer\.info', 'Site Explorer', +'skimbot', 'SkimBot', +'slysearch','SlySearch', +'smtbot', 'SMTBot', +'snap\.com_beta_crawler','snap.com beta crawler', +'sohu\-search','sohu-search', +'sohu','sohu agent', +'snappy','Snappy', +'spbot', 'SEOprofiler Bot', +'sphere_scout','Sphere Scout', +'spip','SPIP', +'sproose_crawler','sproose crawler', +'ssearch_bot', 'sSearch Crawler', +'steroid__download','STEROID Download', +'steeler','Steeler', +'stq_bot', 'SEARCHTEQ', +'suchfin\-bot','Suchfin-Bot', +'superbot','SuperBot', +'surveybot','SurveyBot', +'susie','Susie', +'syndic8','Syndic8', +'syndicapi','SyndicAPI', +'synoobot','SynooBot', +'tcl_http_client_package','Tcl http client package', +'technoratibot', 'Technoratibot', +'teragramcrawlersurf','TeragramCrawlerSURF', +'test_crawler','Test Crawler', +'testbot','TestBot', +'thumbsniper', 'ThumbSniper', +'t\-h\-u\-n\-d\-e\-r\-s\-t\-o\-n\-e','T-H-U-N-D-E-R-S-T-O-N-E', +'topicblogs', 'topicblogs', +'turnitinbot', 'Turn It In', +'turtle', 'Turtle', +'turtlescanner', 
'Turtle', +'tutorgigbot','TutorGigBot', +'twiceler','twiceler', +'ubicrawler','UbiCrawler', +'ultraseek', 'Ultraseek', +'unchaos_bot_hybrid_web_search_engine','UnChaos Bot Hybrid Web Search Engine', +'unido\-bot','unido-bot', +'unisterbot', 'UnisterBot; E-Mail only: crawler (at) unister.de', +'updated','updated', +'ustc\-semantic\-group','USTC-Semantic-Group', +'vagabondo\-wap','Vagabondo-WAP', +'vagabondo','Vagabondo', +'vebidoobot', 'vebidoobot', +'vermut','Vermut', +'versus_crawler_from_eda\.baykan@epfl\.ch','versus crawler from eda.baykan@epfl.ch', +'vespa_crawler','Vespa Crawler', +'voltron', 'voltron', +'vortex','VORTEX', +'vse\/','VSE', +'w3c\-checklink','W3C Link Checker', +'w3c[_+\s]css[_+\s]validator[_+\s]jfouffa', 'W3C jigsaw CSS Validator', +'w3c_validator','W3C Validator', +'watchmouse', 'WatchMouse Website Monitor', +'wavefire','Wavefire', +'waybackarchive\.org', 'No website, email: spider(at)waybackarchive.org', +# 2.12.2013 Project Honeypot reports at least one of the IPs used by waybackarchive with a spiderlytics UA string. +# Problably not related to the wayback machine of archive.org. 
+'wbsearchbot', 'WBSearchBot', +'webclipping\.com', 'WebClipping.com', +'webcompass', 'webcompass', +'webcrawl\.net','webcrawl.net', +'web_downloader','Web Downloader', +'webdup','Webdup', +'webfilter','WebFilter', +'webindexer','WebIndexer', +'webminer','WebMiner', +'website[_+\s]monitoring[_+\s]bot','Website_Monitoring_Bot', +'webvulncrawl', 'WebVulnCrawl', +'wells_search','Wells Search', +'wer-liefert-was', 'Wer-liefert-was Crawler Note: AWStats counts most traffic as user traffic', +'wesee:search', 'WeSEE Bot', +'wevikabot', 'WeViKa', +'wonderer', 'Web Wombat Redback Spider', +'wotbox', 'Wotbox', +'wume_crawler','wume crawler', +'wwweasel',,'WWWeasel', +'xenu\'s_link_sleuth','Xenu Link Sleuth', +'xenu_link_sleuth','Xenu Link Sleuth', +'xirq','xirq', +'xovibot', 'XoviBot', +'y!j', 'Y!J Yahoo Japan', +'yacy', 'YaCy', +'yahoo\-blogs','Yahoo-Blogs', +'yahoo\-verticalcrawler', 'Yahoo Vertical Crawler', +'yahoofeedseeker', 'Yahoo Feed Seeker', +'yahooseeker\-testing', 'YahooSeeker-Testing', +'yahooseeker', 'YahooSeeker Yahoo! Blog crawler', +'yahoo\-mmcrawler', 'Yahoo-MMCrawler', +'yahoo!_mindset','Yahoo! 
Mindset', +'yandex', 'Yandex Bot', +'flexum', 'Flexum Search Engine', +'yanga', 'Yanga WorldSearch Bot', +'yet-another-spider','Yet-Another-Spider', +'yisouspider', 'YisouSpider (no additional information in UA string)', +'yooglifetchagent','yoogliFetchAgent', +'z\-add_link_checker','Z-Add Link Checker', +'zealbot','ZealBot', +'zhuaxia','ZhuaXia', +'zspider','zspider', +'zeus','Zeus Webster Pro', +'zumbot','ZumBot', +'ng\/1\.','NG 1.x (Exalead)', # put at end to avoid false positive +'ng\/2\.','NG 2.x (Exalead)', # put at end to avoid false positive +'exabot','Exabot', # put at end to avoid false positive +# Other id that are 99% of robots +'wget','WGet tools', +'libwww','Perl tool', +'^java\/[0-9]','Java (Often spam bot)', # put at end to avoid false positive +# Generic robot +'robot', 'Unknown robot (identified by \'robot\')', +'checker', 'Unknown robot (identified by \'checker\')', +'crawl', 'Unknown robot (identified by \'crawl\')', +'discovery', 'Unknown robot (identified by \'discovery\')', +'hunter', 'Unknown robot (identified by \'hunter\')', +'scanner', 'Unknown robot (identified by \'scanner\')', +'spider', 'Unknown robot (identified by \'spider\')', +'sucker', 'Unknown robot (identified by \'sucker\')', +'bot[\s_+:,\.\;\/\\\-]', 'Unknown robot (identified by \'bot\' followed by a space or one of the following characters _+:,.;/\-)', +'[\s_+:,\.\;\/\\\-]bot', 'Unknown robot (identified by a space or one of the characters _+:,.;/\- followed by \'bot\')', +'curl', 'Common *nix tool for automating web document retrieval. Most likely a bot.', +'php', 'A PHP script', +'ruby\/', 'Ruby script', +# Additional bots found by Sussex. 
+'^[1-3]$', 'Generic bot identified as "1", "2" or "3"', +'alltop', 'alltop', +'applesyndication', 'applesyndication', +'asynchttpclient', 'asynchttpclient', +'bingbot', 'Bingbot', +'blogged_crawl', 'blogged_crawl', +'bloglovin', 'bloglovin', +'butterfly', 'butterfly', +'buzztracker', 'buzztracker', +'carpathia', 'carpathia', +'catbot', 'catbot', +'chattertrap', 'chattertrap', +'check_http', 'check_http (nagios)', +'coldfusion', 'coldfusion', +'covario', 'covario', +'daylifefeedfetcher', 'daylifefeedfetcher', +'discobot', 'discobot', +'dlvr\.it', 'dlvr.it', +'dreamwidth', 'dreamwidth', +'drupal', 'Drupal Site', +'ezoom', 'ezoom', +'feedmyinbox', 'feedmyinbox', +'feedroll\.com', 'feedroll.com', +'feedzira', 'feedzira', +'fever\/', 'Feed a Fever', +'freenews', 'freenews', +'geohasher', 'geohasher', +'hanrss', 'hanrss', +'inagist', 'inagist', +'jacobin\sclub', 'jacobin club', +'jakarta', 'jakarta', +'js\-kit', 'js-kit', +'largesmall\scrawler', 'largesmall crawler', +'linkedinbot', 'linkedinbot', +'longurl', 'longurl', +'metauri', 'metauri', +'microsoft\-webdav\-miniredir', 'microsoft-webdav-miniredir', +'^motorola$', 'Suspected Bot masquerading as "Motorola"', +'movabletype', 'movabletype', +'^mozilla\/3\.0\s\(compatible$', 'Suspected bot masqurading as Mozilla', +'^mozilla\/4\.0$', 'Suspected bot masqurading as Mozilla', +'^mozilla\/4\.0\s\(compatible;\)$', 'Suspected bot masqurading as Mozilla', +'^mozilla\/5\.0$', 'Suspected bot masqurading as Mozilla', +'^mozilla\/5\.0\s\(compatible;$', 'Suspected bot masqurading as Mozilla', +'^mozilla\/5\.0\s\(en\-us\)$', 'Suspected bot masqurading as Mozilla', +'^mozilla\/5\.0\sfirefox\/3\.0\.5$', 'Suspected bot masqurading as Mozilla', +'^msie', 'Suspected bot masquerading as M$ IE', +'netnewswire', 'netnewswire', +'\snetseer\s', 'Net Seer', +'netvibes', 'netvibes', +'newrelicpinger', 'newrelicpinger', +'newsfox', 'Fox News', +'nextgensearchbot', 'nextgensearchbot', +'ning', 'ning', +'pingdom', 'pingdom', +'pita', 'pita (pain 
in the ass?)', +'postpost', 'postpost', +'postrank', 'postrank', +'printfulbot', 'printfulbot', +'protopage', 'protopage', +'proximic', 'Proximic Spider', +'quipply', 'quipply', +'r6\_', 'Radian 6 Crawler', +'ratingburner', 'ratingburner', +'regator', 'regator', +'rome\sclient', 'rome client', +'rpt\-httpclient', 'rpt-httpclient', +'rssgraffiti', 'rssgraffiti', +'sage\+\+', 'sage++', +'scoutjet', 'ScoutJet crawler for Blekko.', +'simplepie', 'simplepie', +'sitebot', 'sitebot', +'summify\.com', 'summify.com', +'superfeedr', 'superfeedr', +'synthesio', 'synthesio', +'teoma', 'teoma', +'topblogsinfo', 'topblogsinfo', +'topix\.net', 'topix.net', +'trapit', 'trapit', +'trileet', 'trileet', +'tweetedtimes', 'The Tweeted Times', +'twisted\spagegetter', 'twisted pagegetter', +'twitterbot', 'Twitterbot', +'twitterfeed', 'twitterfeed', +'unwindfetchor', 'unwindfetchor', +'wazzup', 'wazzup', +'windows\-rss\-platform', 'windows-rss-platform', +'wiumi', 'wiumi', +'xydo', 'xydo', +'yahoo!\sslurp', 'Additional Yahoo bots.', +'yahoo\spipes', 'Additional Yahoo bots.', +'yahoo\-newscrawler', 'Additional Yahoo bots.', +'yahoocachesystem', 'Additional Yahoo bots.', +'yahooexternalcache', 'Additional Yahoo bots.', +'yahoo!\ssearchmonkey', 'Additional Yahoo bots.', +'yahooysmcm', 'Additional Yahoo bots.', +'yammer', 'yammer', +#'yandexbot', 'yandexbot', #already covered by 'yandex' +'yeti', 'yeti', +'yie8', 'yie8', +'youdao', 'youdao', +'yourls', 'yourls', +'zemanta', 'zemanta', +'zend_http_client', 'Zend Http Client', +'no_user_agent','Unknown robot (identified by empty user agent string)', +# Unknown robots identified by hit on robots.txt +'unknown', 'Unknown robot (identified by hit on \'robots.txt\')' +); + + +# RobotsAffiliateLib +# This list try to tell by which Search Engine a robot is used +#------------------------------------------------------------- +%RobotsAffiliateLib = ( +'bingpreview'=>'Bing', +'fast\-webcrawler'=>'AllTheWeb', +'googlebot'=>'Google', 
+'google\-sitemap'=>'Google', +'google[_+\s]web[_+\s]preview'=>'Google', +'msnbot'=>'MSN', +'nutch'=>'Looksmart', +'scooter'=>'AltaVista', +'wisenutbot'=>'Looksmart', +'yahoo\-blogs'=>'Yahoo', +'yahoo\-verticalcrawler'=>'Yahoo', +'yahoofeedseeker'=>'Yahoo', +'yahooseeker\-testing'=>'Yahoo', +'yahooseeker'=>'Yahoo', +'yahoo\-mmcrawler'=>'Yahoo', +'yahoo!_mindset'=>'Yahoo', +'zyborg'=>'Looksmart', +'cfetch'=>'Kosmix', +'^voyager\/'=>'Kosmix', +# Additional bots found by Sussex. +'feedfetcher\-google'=>'Google', +'bingbot'=>'MSN', +'twitterbot'=>'Twitter', +'twitterfeed'=>'Twitter', +'yahoo!\sslurp'=>'Yahoo', +'yahoo\spipes'=>'Yahoo', +'yahoo-newscrawler'=>'Yahoo', +'yahoocachesystem'=>'Yahoo', +'yahooexternalcache'=>'Yahoo', +'yahoo!\ssearchmonkey'=>'Yahoo', +'yahooysmcm'=>'Yahoo' +); + +1; diff --git a/wwwroot/cgi-bin/lib/search_engines.pm b/wwwroot/cgi-bin/lib/search_engines.pm index a84dc4e76..e56b00801 100644 --- a/wwwroot/cgi-bin/lib/search_engines.pm +++ b/wwwroot/cgi-bin/lib/search_engines.pm @@ -1,1578 +1,1578 @@ -# AWSTATS SEARCH ENGINES DATABASE -#------------------------------------------------------------------------------ -# If you want to add a Search Engine to extend AWStats database detection capabilities, -# you must add an entry in SearchEnginesSearchIDOrder, SearchEnginesHashID and in -# SearchEnginesHashLib. -# An entry if known in SearchEnginesKnownUrl is also welcome. -# -# to eldy: Please check if the following description is correct: -# You need the following information to specify a search engine: -# (a) A regular expression that matches the referrer string of the -# search engine. Unclear: What about slashes in the name of -# a search engine, e.g. as in 'ecosia.com/search'. Seems that -# AWStats will non find search strings containing a slash. -# Maybe use a search string without a slash, and - if necessary - -# an entry in %NotSearchEnginesKeys , if this search string -# matches entries that are not search engines. 
-# Example of a web address of a Amazon search engine: -# http://www.amazon.de/gp/bit/apps/web/SERP/search/ref=bit_bds-p24_serp_cr_de?ie=UTF8tagbase=bds-p24&query=deutsch+8.+klasse+gymnasium+protokoll -# (b) A unique string to identify the search engine within AWStats -# (c) A regular expression that finds the start of the query part in the -# referrer string -# (d) A HTML-fragment that goes into the reports generated by AWStats which -# identifies the search engine to human reader of the report. In the -# simplest case this is a string containing the name of the search -# engine. You can also provide a hypertext clause that presents the -# name together with a link to the search engine. -# -# The regular expression (a) goes into SearchEnginesSearchIDOrder_list1 -# or ..._list2. List 1 contains common search engines, list 2 those -# that are not so often used. -# -# SearchEnginesHashID contains to consecutive entries for each search -# engine: The regular expression (a) followed bei the search engine -# identifier (b) -# -# SearchEnginesKnownUrl specifies how to find the start of the query. -# For each search engine you enter the search engine identifier (b) -# followed by the regular expression (c). Unclear: It is possible to -# omit this entry. If you do this, how will AWStats find the start of -# the query? -# -# SearchEnginesHashLib contains also two entries for each search engine: -# The search engine identifier (b) followed by the HTML-Fragment (d) -# -# There are search engines that do not use a query part in their URLs. -# They put the search expression in the main part of the URL instead. -# AWStats is able to handle these cases. They are specified as described -# above, except the following two things: -# - The regular expression (c) searches the complete URL and not only -# the query part. -# - An additional Entry in the list %SearchEnginesWithKeysNotInQuery is -# necessary. -# -# -# AWStats runs a sanity check of the contents of search_engines.pm. 
This -# check detects the following things: -# - Inconsistencies (number of entries) -# It does not detect the following errors: -# - If the HTML-Fragment (d) is syntactically incorrect. -# -#------------------------------------------------------------------------------ - -# 2005-08-19 Sean Carlos http://www.antezeta.com/awstats.html -# added minor italian search engines -# arianna http://arianna.libero.it/ -# supereva http://search.supereva.com/ -# kataweb http://kataweb.it/ -# corrected uk looksmart -# 'askuk','ask=', 'bbc','q=', 'freeserve','q=', 'looksmart','key=', -# to -# 'askuk','ask=', 'bbc','q=', 'freeserve','q=', 'looksmartuk','key=', -# corrected spelling -# internationnal -> international -# added 'google\.'=>'mail\.google\.', to NotSearchEnginesKeys in order to -# avoid counting gmail referrals as search engine traffic -# 2005-08-21 Sean Carlos http://www.antezeta.com/awstats.html -# avoid counting babelfish.altavista referrals as search engine traffic -# avoid counting translate.google referrals as search engine traffic -# 2005-11-20 Sean Carlos -# added missing 'tiscali','key=', entry. Check order -# 2005-11-22 Sean Carlos -# added Google Base & Froogle. Froogle not tested. 
-# 2006-04-18 Sean Carlos http://www.antezeta.com/awstats.html -# added biglotron.com (France) -# added blingo http://www.blingo.com/ -# added Clusty & Vivisimo -# added eniro.no (Norway) [https://sourceforge.net/forum/message.php?msg_id=3134783] -# added GPU p2p search http://search.centraldatabase.org/ -# added mail.tiscali to "not search engines list" [https://sourceforge.net/forum/message.php?msg_id=3166688] -# added Ask group's "mysearch" -# added sify.com (India) -# added sogou.com (Cina) [https://sourceforge.net/forum/message.php?msg_id=3501603] -# Ask changes: -# - added Ask Japan (ask.jp) -# - break out Ask new country level variants (DE, ES, FR, IT, NL) -# - updated Ask name from Ask Jevees -# - added Ask q= parameter - many recent searches probably not recognized; [https://sourceforge.net/forum/message.php?msg_id=3465444] -# - updated Ask uk (new uk.ask.com added to older ask.co.uk) -# updated voila kw|rdata parameter [https://sourceforge.net/forum/message.php?msg_id=3373912] -# for each new engine, added link to Search Engine. This serves to document engine. Done for major & Italian engines as well. Requires patch -# to AWStats to allow untranslated html. Otherwise html will appear instead of link. -# reviewed mnoGoSearch (http://www.mnogosearch.org/); the search engined mentioned no longer -# exists https://sourceforge.net/forum/message.php?msg_id=3025426 -# 2006-05-13 Sean Carlos http://www.antezeta.com/awstats.html -# added 10 Chello European broadband portals (Austria, Belgium, Czech Republic, France, Hungary, The Netherlands, Norway, Poland, Slovakia, Sweden) -# added Alice Internal Search (blends data with Google?) search.alice.it.master:10005 -# added detection of google cache views from IPs 66.249.93.104 72.14.203.104 72.14.207.104 -# To do: add more extensive IP list; keywords not yet detected. -# added icerocket.com blog search http://www.icerocket.com/ -# added live.com (msn) http://www.live.com/ -# added Meta motor kartoo. 
Note: Kartoo does not provide search words in referrers, thus the engine will appear in the -# search engine list but the actual search words are not available. -# added netluchs.de http://www.netluchs.de/ -# added sphere.com blog search http://www.sphere.com/ -# added wwweasel.de http://wwweasel.de -# added Yahoo Mindset! http://mindset.research.yahoo.com/ -# updated Mirago query parameter recognition (qry=); added breakout for each country (France, Germany, Spain, Italy, Norway, Sweden, Denmark, Netherlands, Belgium, Switzerland) -# 2006-05-13 Sean Carlos http://www.antezeta.com/awstats.html -# added Google cache IPs 64.233.183.104 & 66.102.7.104 -# 2006-05-20 Sean Carlos http://www.antezeta.com/awstats.html -# anzwers.com.au -# schoenerbrausen.de http://www.schoenerbrausen.de/ -# added Google cache IP 216.239.59.104 -# answerbus http://www.answerbus.com/ (does not provide keywords) -# 2006-05-23 Sean Carlos http://www.antezeta.com/awstats.html -# added Google cache IP 66.102.9.104, 64.233.161.104 -# 2006-06-23 Sean Carlos http://www.antezeta.com/awstats.html -# added Alice Search search.alice.it -# added GoodSearch http://www.goodsearch.com/ (does not provide keywords) "a Yahoo-powered search engine that donates money to your favorite charity or school each time you search the web" -# added googlee.com, variant of Google -# added gotuneed http://www.gotuneed.com/ Italian search engine, in beta -# added icq.com -# added logic to parse Google Cache search keywords. Seems to work for alpha but not numeric cache IDs, i.e. search?q=cache:lWVLmnuGJswJ: is recognized but q=cache:Yv5qxeJNuhgJ: is not recognized. The URL triggering the keywords will also appear. The URLs are probably too varied to parse out? 
-# added Nusearch http://www.nusearch.com/ -# added Polymeta www.polymeta.hu (does not provide keywords) -# added scroogle http://www.scroogle.org/ (does not always provide keywords) -# added Tango http://tango.hu/search.php?st=0&q=jeles+napok -# Changed Google Cache notation 64\.233\.(161|167|179|183|187)\.104 to 64\.233\.1[0-9]{2}\.104 -# 72\.14\.(203|205|207|209|221)\.104 to 72\.14\.2[0-9]{2}\.104 -# 216\.239\.(51|59)\.104 to 216\.239\.5[0-9]\.104 -# 66\.102\.(7|9)\.104 to 66\.102\.[1-9]\.104 -# 2006-06-27 Sean Carlos http://www.antezeta.com/awstats.html -# added Onet.pl http://szukaj.onet.pl/ -# corrected name "Wirtualna Polska" from "Szukaj" (search); added link http://szukaj.wp.pl/ -# 2006-06-30 Sean Carlos http://www.antezeta.com/awstats.html -# Additional Polish Search Engines: -# added Dodaj.pl http://www.dodaj.pl/ -# added Gazeta.pl http://szukaj.gazeta.pl/ -# added Gery.pl http://szukaj.gery.pl/ -# added Hoga.pl http://www.hoga.pl/ -# added Interia.pl http://www.google.interia.pl/ -# added Katalog.Onet.pl http://katalog.onet.pl/ -# added NetSprint.pl http://www.netsprint.pl/ -# added o2.pl http://szukaj2.o2.pl/ -# added Polska http://szukaj.polska.pl/ -# added Szukacz http://www.szukacz.pl/ -# added Wow.pl http://szukaj.wow.pl/ -# added Sagool http://sagool.jp/ - -# 2006-08-25 Social Bookmarks -# International -# added del.icio.us/search - for now, just search referrer. To do: consider /tag/(tagname) referrer? -# added stumbleupon.com - No keywords supplied. -# added swik.net -# added digg. Keywords sometimes supplied. -# Italy -# added segnalo.alice.it - No keywords supplied. -# added ineffabile.it - No keywords supplied. - -# added filter for google groups. Attempt to parse group name as keyword. 
- -# 2006-09-14 -# added Eniro Sverige http://www.eniro.se/ -# added MyWebSearch http://search.mywebsearch.com/ -# added Teecno http://www.teecno.it/ Italian Open Source Search Engine - -#package AWSSE; - -# 2006-09-25 (Gabor Moizes) -# added 4-counter (Google alternative) http://4-counter.com/ -# added Googlecom (Google alternative) http://googlecom.com/ -# added Goggle (Google alternative) http://goggle.co.hu/ -# added Comet toolbar http://as.starware.com -# added new IP for Yahoo: 216.109.125.130 -# added Ledix http://ledix.net/ -# added AT&T search (powered by Google) http://www.att.net/ -# added Keresolap (Hungarian search engine) http://www.keresolap.hu/ -# added Mozbot (French search engine) http://www.mozbot.fr/ -# added Zoznam (Slovak search engine) http://www.zoznam.sk/ -# added sapo.pt (Portuguese search engine) http://www.sapo.pt/ -# added shaw.ca (powered by Google) http://start.shaw.ca/ -# added Searchalot http://www.searchalot.com/ -# added Copernic http://www.copernic.com/ -# added 216.109.125.130 to Yahoo -# added 66.218.69.11 to Yahoo -# added Avantfind http://www.avantfind.com/ -# added Steadysearch http://www.steadysearch.com/ -# added Steadysearch http://www.steady-search.com/ -# modified 216\.239\.5[0-9]\.104/search to 216\.239\.5[0-9]\.104 - - -# SearchEnginesSearchIDOrder -# It contains all matching criteria to search for in log fields. This list is -# used to know in which order to search Search Engines IDs. 
-# Most frequent one are in list1, used when LevelForSearchEnginesDetection is 1 or more -# Minor robots are in list2, used when LevelForSearchEnginesDetection is 2 or more -# Note: Regex IDs are in lower case and ' ' and '+' are changed into '_' -#------------------------------------------------------------------------------ -@SearchEnginesSearchIDOrder_list1=( -# Major international search engines -'google\.[\w.]+/products', -'base\.google\.', -'froogle\.google\.', -'groups\.google\.', -'images\.google\.', -'google\.', -'googlee\.', -'googlecom\.com', -'goggle\.co\.hu', -'216\.239\.32\.20', -'173\.194\.32\.223', -'216\.239\.(35|37|39|51)\.100', -'216\.239\.(35|37|39|51)\.101', -'216\.239\.5[0-9]\.104', -'64\.233\.1[0-9]{2}\.104', -'66\.102\.[1-9]\.104', -'66\.249\.93\.104', -'72\.14\.2[0-9]{2}\.104', -'msn\.', -'live\.com', -'bing\.', -'voila\.', -'mindset\.research\.yahoo', -'yahoo\.','(66\.218\.71\.225|216\.109\.117\.135|216\.109\.125\.130|66\.218\.69\.11)', -'search\.aol\.co', -'tiscali\.', -'lycos\.', -'alexa\.com', -'alltheweb\.com', -'altavista\.', -'a9\.com', -'dmoz\.org', -'netscape\.', -'search\.terra\.', -'www\.search\.com', -'search\.sli\.sympatico\.ca', -'excite\.' -); - -@SearchEnginesSearchIDOrder_list2=( -# Minor international search engines -'4\-counter\.com', -'att\.net', -'bungeebonesdotcom', -'northernlight\.', -'hotbot\.', -'kvasir\.', -'webcrawler\.', -'metacrawler\.', -'go2net\.com', -'(^|\.)go\.com', -'euroseek\.', -'looksmart\.', -'spray\.', -'nbci\.com\/search', -'de\.ask.\com', # break out Ask country specific engines. 
(.jp is in Japan section) -'es\.ask.\com', -'fr\.ask.\com', -'it\.ask.\com', -'nl\.ask.\com', -'uk\.ask.\com', -'(^|\.)ask\.com', -'atomz\.', -'overture\.com', # Replace 'goto\.com','Goto.com', -'teoma\.', -'findarticles\.com', -'infospace\.com', -'mamma\.', -'dejanews\.', -'dogpile\.com', -'wisenut\.com', -'ixquick\.com', -'search\.earthlink\.net', -'i-une\.com', -'blingo\.com', -'centraldatabase\.org', -'clusty\.com', -'mysearch\.', -'vivisimo\.com', -'kartoo\.com', -'icerocket\.com', -'sphere\.com', -'ledix\.net', -'start\.shaw\.ca', -'searchalot\.com', -'copernic\.com', -'avantfind\.com', -'steadysearch\.com', -'steady-search\.com', -'claro-search\.com', -'www1\.search-results\.com', -'www\.holasearch\.com', -'search\.conduit\.com', -'static\.flipora\.com', -'(?:www[12]?|mixidj)\.delta-search\.com', -'start\.iminent\.com', -'www\.searchmobileonline\.com', -'int\.search-results\.com', -'www2\.inbox\.com', -'www\.govome\.com', -'find1friend\.com', -'start\.mysearchdial\.com', -'go\.speedbit\.com', -'search\.certified-toolbar\.com', -'search\.sweetim\.com', -'search\.searchcompletion\.com', -'en\.eazel\.com', -'sr\.searchfunmoods\.com', -'173\.194\.35\.177', -'dalesearch\.com', -'sweetpacks-search\.com', -'searchgol\.com', -'duckduckgo\.com', -'sr\.facemoods\.com', -'shoppstop\.com', -'searchya\.com', -'picsearch\.de', -'webssearches\.com', -'airzip\.inspsearch\.com', -'zapmeta\.de', -'localmoxie\.com', -'search-results\.mobi', -'androidsearch\.com', -'isearch\.nation\.com', -'search\.zonealarm\.com', -'www\.buenosearch\.com', -'search\.foxtab\.com', -'searches\.qone8\.com', -'startpage\.com', -'www\.qwant\.com', -'searches\.safehomepage\.com', -'searches\.vi-view\.com', -'wow\.utop\.it', -'windowssearch\.com', -'www\.wow\.com', -'globososo\.', -'kingtale3\.inspsearch\.com', -'swisscows\.ch', -'preciobarato\.xyz', -'www\.dregol\.com', -'search\.socialdownloadr\.com', -'int\.search\.myway\.com', -'de\.dolphin\.com', -'mys\.yoursearch\.me', -# Chello Portals 
-'chello\.at', -'chello\.be', -'chello\.cz', -'chello\.fr', -'chello\.hu', -'chello\.nl', -'chello\.no', -'chello\.pl', -'chello\.se', -'chello\.sk', -'chello', # required as catchall for new countries not yet known -# Mirago -'mirago\.be', -'mirago\.ch', -'mirago\.de', -'mirago\.dk', -'es\.mirago\.com', -'mirago\.fr', -'mirago\.it', -'mirago\.nl', -'no\.mirago\.com', -'mirago\.se', -'mirago\.co\.uk', -'mirago', # required as catchall for new countries not yet known -'answerbus\.com', -'icq\.com\/search', -'nusearch\.com', -'goodsearch\.com', -'scroogle\.org', -'questionanswering\.com', -'mywebsearch\.com', -'as\.starware\.com', -# Social Bookmarking Services -'del\.icio\.us', -'digg\.com', -'stumbleupon\.com', -'swik\.net', -'segnalo\.alice\.it', -'ineffabile\.it', -# Minor Australian search engines -'anzwers\.com\.au', -# Minor brazilian search engines -'engine\.exe', 'miner\.bol\.com\.br', -# Minor chinese search engines -'\.baidu\.com', # baidu search portal -'\.vnet\.cn', # powered by MSN -'\.soso\.com', # powered by Google -'\.sogou\.com', # powered by Sohu -'\.3721\.com', # powered by Yahoo! 
-'iask\.com', # powered by Sina -'\.accoona\.com', # Accoona -'\.163\.com', # powered by Google -'\.zhongsou\.com', # zhongsou search portal -# Minor czech search engines -'atlas\.cz','seznam\.cz','quick\.cz','centrum\.cz','jyxo\.(cz|com)','najdi\.to','redbox\.cz', -'isearch\.avg\.com', -# Minor danish search-engines -'opasia\.dk', 'danielsen\.com', 'sol\.dk', 'jubii\.dk', 'find\.dk', 'edderkoppen\.dk', 'netstjernen\.dk', 'orbis\.dk', 'tyfon\.dk', '1klik\.dk', 'ofir\.dk', -# Minor dutch search engines -'ilse\.','vindex\.', -# Minor english search engines -'(^|\.)ask\.co\.uk','bbc\.co\.uk/cgi-bin/search','ifind\.freeserve','looksmart\.co\.uk','splut\.','spotjockey\.','ukdirectory\.','ukindex\.co\.uk','ukplus\.','searchy\.co\.uk', -'search\.fbdownloader\.com', -'search\.fdownloadr\.com', -'search\.babylon\.com', -'my\.allgameshome\.com', -'surfcanyon\.com', -'uk\.foxstart\.com', -'yandex\.com', -# Minor finnish search engines -'haku\.www\.fi', -# Minor french search engines -'recherche\.aol\.fr','ctrouve\.','francite\.','\.lbb\.org','rechercher\.libertysurf\.fr', 'search[\w\-]+\.free\.fr', 'recherche\.club-internet\.fr', -'toile\.com', 'biglotron\.com', -'mozbot\.fr', -# Minor german search engines -'sucheaol\.aol\.de', -'o2suche\.aol\.de', -'fireball\.de','infoseek\.de','suche\d?\.web\.de','[a-z]serv\.rrzn\.uni-hannover\.de', -'suchen\.abacho\.de','(brisbane|suche)\.t-online\.de','allesklar\.de','meinestadt\.de', -'212\.227\.33\.241', -'(161\.58\.227\.204|161\.58\.247\.101|212\.40\.165\.90|213\.133\.108\.202|217\.160\.108\.151|217\.160\.111\.99|217\.160\.131\.108|217\.160\.142\.227|217\.160\.176\.42)', -'wwweasel\.de', -'netluchs\.de', -'schoenerbrausen\.de', -'suche\.gmx\.net', -'suche\.gmx\.at', -'ecosia\.org', -'de\.aolsearch\.com', -'suche\.aol\.de', -'www\.startxxl\.com', -'www\.benefind\.de', -'www\.amazon\.de.*search', #Just as a reminder, probably will not work as AWstats seems to consider the host part of an URL only -'de\.wow\.com', -'www\.vlips\.de', 
-'metager\.de', -'search\.1und1\.de', -'sm\.de', -'sumaja\.de', -'navigationshilfe\.t-online\.de', -'umfis\.de', -'fastbot\.de', -'tixuma\.de', -'suche\.freenet\.de', -'www\.izito\.de', -'extern\.peoplecheck\.de', -'www\.oneseek\.de', -'de\.wiki\.gov\.cn', -'umuwa\.de', -'suche\.1und1\.de', -'www\.metasuche\.ch', -# Minor Hungarian search engines -'heureka\.hu','vizsla\.origo\.hu','lapkereso\.hu','goliat\.hu','index\.hu','wahoo\.hu','webmania\.hu','search\.internetto\.hu', -'tango\.hu', -'keresolap\.hu', -'kereso\.startlap\.hu', -'polymeta\.hu', -# Minor Indian search engines -'sify\.com', -# Minor Italian search engines -'virgilio\.it','arianna\.libero\.it','supereva\.com','kataweb\.it','search\.alice\.it\.master','search\.alice\.it','gotuneed\.com', -'godado','jumpy\.it','shinyseek\.it','teecno\.it', -# Minor Israeli search engines -'search\.genieo\.com', -# Minor Japanese search engines -'ask\.jp','sagool\.jp', -'websearch\.rakuten\.co\.jp', -# Minor Norwegian search engines -'sok\.start\.no', 'eniro\.no', -# Minor Polish search engines -'szukaj\.wp\.pl','szukaj\.onet\.pl','dodaj\.pl','gazeta\.pl','gery\.pl','hoga\.pl','netsprint\.pl','interia\.pl','katalog\.onet\.pl','o2\.pl','polska\.pl','szukacz\.pl','wow\.pl', -# Minor russian search engines -'ya(ndex)?\.ru', 'aport\.ru', 'rambler\.ru', 'turtle\.ru', 'metabot\.ru', -'go\.mail\.ru', -# Minor Swedish search engines -'evreka\.passagen\.se','eniro\.se', -# Minor Slovak search engines -'zoznam\.sk', -# Minor Portuguese search engines -'sapo\.pt', -# Minor swiss search engines -'search\.ch', 'search\.bluewin\.ch', -'www\.zapmeta\.ch', -'etools\.ch', -# Minor Croatian, Serbian, Macedonian, Bosnian and Herzegovinian search engines -'pogodak\.' 
-); -@SearchEnginesSearchIDOrder_listgen=( -# Generic search engines -'search\..*\.\w+' -); - - -# NotSearchEnginesKeys -# If a search engine key is found, we check its exclude list to know if it's -# really a search engine -#------------------------------------------------------------------------------ -%NotSearchEnginesKeys=( -'altavista\.'=>'babelfish\.altavista\.', -'google\.'=>'mail\.google\.', -'google\.'=>'translate\.google\.', -'google\.'=>'code\.google\.', -'google\.'=>'groups\.google\.', -'msn\.'=>'hotmail\.msn\.', -'tiscali\.'=>'mail\.tiscali\.', -'yahoo\.'=>'(?:picks|mail)\.yahoo\.|yahoo\.[^/]+/picks', -'yandex\.'=>'direct\.yandex\.' -); - - -# SearchEnginesHashID -# Each Search Engine Search ID is associated to an AWStats id string -#------------------------------------------------------------------------------ -%SearchEnginesHashID = ( -# Major international search engines -'google\.[\w.]+/products','google_products', -'base\.google\.','google_base', -'froogle\.google\.','google_froogle', -'groups\.google\.','google_groups', -'images\.google\.','google_image', -'google\.','google', -'googlee\.','google', -'googlecom\.com','google', -'goggle\.co\.hu','google', -'216\.239\.32\.20', 'google', -'173\.194\.32\.223', 'google', -'216\.239\.(35|37|39|51)\.100','google_cache', -'216\.239\.(35|37|39|51)\.101','google_cache', -'216\.239\.5[0-9]\.104','google_cache', -'64\.233\.1[0-9]{2}\.104','google_cache', -'66\.102\.[1-9]\.104','google_cache', -'66\.249\.93\.104','google_cache', -'72\.14\.2[0-9]{2}\.104','google_cache', -'msn\.','msn', -'live\.com','live', -'bing\.','bing', -'voila\.','voila', -'mindset\.research\.yahoo','yahoo_mindset', -'yahoo\.','yahoo','(66\.218\.71\.225|216\.109\.117\.135|216\.109\.125\.130|66\.218\.69\.11)','yahoo', -'lycos\.','lycos', -'alexa\.com','alexa', -'alltheweb\.com','alltheweb', -'altavista\.','altavista', -'a9\.com','a9', -'dmoz\.org','dmoz', -'netscape\.','netscape', -'search\.terra\.','terra', 
-'www\.search\.com','search.com', -'tiscali\.','tiscali', -'search\.aol\.co','aol', -'search\.sli\.sympatico\.ca','sympatico', -'excite\.','excite', -# Minor international search engines -'4\-counter\.com','google4counter', -'att\.net','att', -'bungeebonesdotcom','bungeebonesdotcom', -'northernlight\.','northernlight', -'hotbot\.','hotbot', -'kvasir\.','kvasir', -'webcrawler\.','webcrawler', -'metacrawler\.','metacrawler', -'go2net\.com','go2net', -'(^|\.)go\.com','go', -'euroseek\.','euroseek', -'looksmart\.','looksmart', -'spray\.','spray', -'nbci\.com\/search','nbci', -'de\.ask.\com','askde', # break out Ask country specific engines. -'es\.ask.\com','askes', -'fr\.ask.\com','askfr', -'it\.ask.\com','askit', -'nl\.ask.\com','asknl', -'uk\.ask.\com','askuk', -'(^|\.)ask\.co\.uk','askuk', -'(^|\.)ask\.com','ask', -'atomz\.','atomz', -'overture\.com','overture', # Replace 'goto\.com','Goto.com', -'teoma\.','teoma', -'findarticles\.com','findarticles', -'infospace\.com','infospace', -'mamma\.','mamma', -'dejanews\.','dejanews', -'dogpile\.com','dogpile', -'wisenut\.com','wisenut', -'ixquick\.com','ixquick', -'search\.earthlink\.net','earthlink', -'i-une\.com','iune', -'blingo\.com','blingo', -'centraldatabase\.org','centraldatabase', -'clusty\.com','clusty', -'mysearch\.','mysearch', -'vivisimo\.com','vivisimo', -'kartoo\.com','kartoo', -'icerocket\.com','icerocket', -'sphere\.com','sphere', -'ledix\.net','ledix', -'start\.shaw\.ca','shawca', -'searchalot\.com','searchalot', -'copernic\.com','copernic', -'avantfind\.com','avantfind', -'steadysearch\.com','steadysearch', -'steady-search\.com','steadysearch', -'claro-search\.com','clarosearch', -'www1\.search-results\.com', 'searchresults', -'www\.holasearch\.com', 'holasearch', -'search\.conduit\.com', 'conduit', -'static\.flipora\.com', 'flipora', -'(?:www[12]?|mixidj)\.delta-search\.com', 'delta-search', -'start\.iminent\.com', 'iminent', -'www\.searchmobileonline\.com', 'searchmobileonline', 
-'int\.search-results\.com', 'nortonsavesearch', -'www2\.inbox\.com', 'inbox', -'www\.govome\.com', 'govome', -'find1friend\.com', 'find1friend', -'start\.mysearchdial\.com', 'mysearchdial', -'go\.speedbit\.com', 'speedbit', -'search\.certified-toolbar\.com', 'certifiedtoolbarsearch', -'search\.sweetim\.com', 'sweetim', -'search\.searchcompletion\.com', 'searchcompletion', -'en\.eazel\.com','eazelsearch', -'sr\.searchfunmoods\.com', 'searchfunmoods', -'173\.194\.35\.177', 'googleByIP', -'dalesearch\.com', 'dalesearch', -'sweetpacks-search\.com', 'sweetpacks', -'searchgol\.com', 'searchgol', -'duckduckgo\.com', 'duckduckgo', -'sr\.facemoods\.com', 'facemoods', -'shoppstop\.com', 'shoppstop', -'searchya\.com', 'searchya', -'picsearch\.de', 'picsearch', -'webssearches\.com', 'webssearches', -'airzip\.inspsearch\.com', 'webssearches', -'zapmeta\.de', 'zapmeta', -'localmoxie\.com', 'localmoxie', -'search-results\.mobi', 'search-results_mobi', -'androidsearch\.com', 'androidsearch', -'isearch\.nation\.com', 'isearch_nation_com', -'search\.zonealarm\.com', 'search_zonealarm_com', -'www\.buenosearch\.com', 'www_buenosearch_com', -'search\.foxtab\.com', 'search_foxtab_com', -'searches\.qone8\.com', 'searches_qone8_com', -'startpage\.com', 'startpage_com', -'www\.qwant\.com', 'qwant_com', -'searches\.safehomepage\.com', 'safehomepage_com', -'searches\.vi-view\.com', 'vi-view_com', -'wow\.utop\.it', 'wow_utop_it', -'windowssearch\.com', 'windowssearch_com', -'www\.wow\.com', 'www_wow_com', -'globososo\.', 'globososo', -'kingtale3\.inspsearch\.com', 'globososo', -'swisscows\.ch', 'swisscows_ch', -'preciobarato\.xyz', 'preciobarato_xyz', -'www\.dregol\.com', 'www_dregol_com', -'search\.socialdownloadr\.com', 'search_socialdownloadr_com', -'int\.search\.myway\.com', 'int_search_myway_com', -'de\.dolphin\.com', 'de_dolphin_com', -'mys\.yoursearch\.me', 'mys_yoursearch_me', -# Chello Portals -'chello\.at','chelloat', -'chello\.be','chellobe', -'chello\.cz','chellocz', 
-'chello\.fr','chellofr', -'chello\.hu','chellohu', -'chello\.nl','chellonl', -'chello\.no','chellono', -'chello\.pl','chellopl', -'chello\.se','chellose', -'chello\.sk','chellosk', -'chello','chellocom', -# Mirago -'mirago\.be','miragobe', -'mirago\.ch','miragoch', -'mirago\.de','miragode', -'mirago\.dk','miragodk', -'es\.mirago\.com','miragoes', -'mirago\.fr','miragofr', -'mirago\.it','miragoit', -'mirago\.nl','miragonl', -'no\.mirago\.com','miragono', -'mirago\.se','miragose', -'mirago\.co\.uk','miragocouk', -'mirago','mirago', # required as catchall for new countries not yet known -'answerbus\.com','answerbus', -'icq\.com\/search','icq', -'nusearch\.com','nusearch', -'goodsearch\.com','goodsearch', -'scroogle\.org','scroogle', -'questionanswering\.com','questionanswering', -'mywebsearch\.com','mywebsearch', -'as\.starware\.com','comettoolbar', -# Social Bookmarking Services -'del\.icio\.us','delicious', -'digg\.com','digg', -'stumbleupon\.com','stumbleupon', -'swik\.net','swik', -'segnalo\.alice\.it','segnalo', -'ineffabile\.it','ineffabile', -# Minor Australian search engines -'anzwers\.com\.au','anzwers', -# Minor brazilian search engines -'engine\.exe','engine', -'miner\.bol\.com\.br','miner', -# Minor chinese search engines -'\.baidu\.com','baidu', -'iask\.com','iask', -'\.accoona\.com','accoona', -'\.3721\.com','3721', -'\.163\.com','netease', -'\.soso\.com','soso', -'\.zhongsou\.com','zhongsou', -'\.vnet\.cn','vnet', -'\.sogou\.com','sogou', -# Minor czech search engines -'atlas\.cz','atlas', -'seznam\.cz','seznam', -'quick\.cz','quick', -'centrum\.cz','centrum', -'jyxo\.(cz|com)','jyxo', -'najdi\.to','najdi', -'redbox\.cz','redbox', -'isearch\.avg\.com', 'avgsearch', -# Minor danish search-engines -'opasia\.dk','opasia', -'danielsen\.com','danielsen', -'sol\.dk','sol', -'jubii\.dk','jubii', -'find\.dk','finddk', -'edderkoppen\.dk','edderkoppen', -'netstjernen\.dk','netstjernen', -'orbis\.dk','orbis', -'tyfon\.dk','tyfon', -'1klik\.dk','1klik', 
-'ofir\.dk','ofir', -# Minor dutch search engines -'ilse\.','ilse', -'vindex\.','vindex', -# Minor english search engines -'bbc\.co\.uk/cgi-bin/search','bbc', -'ifind\.freeserve','freeserve', -'looksmart\.co\.uk','looksmartuk', -'splut\.','splut', -'spotjockey\.','spotjockey', -'ukdirectory\.','ukdirectory', -'ukindex\.co\.uk','ukindex', -'ukplus\.','ukplus', -'searchy\.co\.uk','searchy', -'search\.fbdownloader\.com','fbdownloader', -'search\.fdownloadr\.com', 'fdownloadr_com', -'search\.babylon\.com', 'babylon', -'my\.allgameshome\.com', 'allgameshome', -'surfcanyon\.com', 'surfcanyon_com', -'uk\.foxstart\.com', 'uk_foxstart_com', -'yandex\.com', 'yandex_com', -# Minor finnish search engines -'haku\.www\.fi','haku', -# Minor french search engines -'recherche\.aol\.fr','aolfr', -'ctrouve\.','ctrouve', -'francite\.','francite', -'\.lbb\.org','lbb', -'rechercher\.libertysurf\.fr','libertysurf', -'search[\w\-]+\.free\.fr','free', -'recherche\.club-internet\.fr','clubinternet', -'toile\.com','toile', -'biglotron\.com', 'biglotron', -'mozbot\.fr', 'mozbot', -# Minor german search engines -'sucheaol\.aol\.de','aolde', -'o2suche\.aol\.de','o2aolde', -'fireball\.de','fireball', -'infoseek\.de','infoseek', -'suche\d?\.web\.de','webde', -'[a-z]serv\.rrzn\.uni-hannover\.de','meta', -'suchen\.abacho\.de','abacho', -'(brisbane|suche)\.t-online\.de','t-online', -'allesklar\.de','allesklar', -'meinestadt\.de','meinestadt', -'212\.227\.33\.241','metaspinner', -'(161\.58\.227\.204|161\.58\.247\.101|212\.40\.165\.90|213\.133\.108\.202|217\.160\.108\.151|217\.160\.111\.99|217\.160\.131\.108|217\.160\.142\.227|217\.160\.176\.42)','metacrawler_de', -'wwweasel\.de','wwweasel', -'netluchs\.de','netluchs', -'schoenerbrausen\.de','schoenerbrausen', -'suche\.gmx\.net', 'gmxsuche', -'suche\.gmx\.at', 'gmxsuche_at', -'ecosia\.org', 'ecosiasearch', -'de\.aolsearch\.com', 'aolsearch', -'suche\.aol\.de', 'aolsuche', -'www\.startxxl\.com', 'startxxl', -'www\.benefind\.de', 'benefind', 
-'www\.amazon\.de.*search', 'amazonsearch', #Not clear if this matches amazon searches only -'de\.wow\.com', 'wowsearch', -'www\.vlips\.de', 'vlips_de', -'metager\.de', 'metager', -'search\.1und1\.de', 'search_1und1_de', -'sm\.de', 'smde', -'sumaja\.de', 'sumaja', -'navigationshilfe\.t-online\.de', 'navigationshilfe', -'umfis\.de', 'umfis', -'fastbot\.de', 'fastbot_de', -'tixuma\.de', 'tixuma_de', -'suche\.freenet\.de', 'freenet_de', -'www\.izito\.de', 'izito_de', -'extern\.peoplecheck\.de', 'peoplecheck_de', -'www\.oneseek\.de', 'oneseek_de', -'de\.wiki\.gov\.cn', 'de_wiki_gov_cn', -'umuwa\.de', 'umuwa_de', -'suche\.1und1\.de', '1und1_de', -'www\.metasuche\.ch', 'metasuche_ch', -# Minor Hungarian search engines -'heureka\.hu','heureka', -'vizsla\.origo\.hu','origo', -'lapkereso\.hu','lapkereso', -'goliat\.hu','goliat', -'index\.hu','indexhu', -'wahoo\.hu','wahoo', -'webmania\.hu','webmania', -'search\.internetto\.hu','internetto', -'tango\.hu','tango_hu', -'keresolap\.hu','keresolap_hu', -'kereso\.startlap\.hu', 'startlap_hu', -'polymeta\.hu','polymeta_hu', -# Minor Indian search engines -'sify\.com','sify', -# Minor Italian search engines -'virgilio\.it','virgilio', -'arianna\.libero\.it','arianna', -'supereva\.com','supereva', -'kataweb\.it','kataweb', -'search\.alice\.it\.master','aliceitmaster', -'search\.alice\.it','aliceit', -'gotuneed\.com','gotuneed', -'godado','godado', -'jumpy\.it','jumpy\.it', -'shinyseek\.it','shinyseek\.it', -'teecno\.it','teecnoit', -# Minor Israeli search engines -'search\.genieo\.com', 'genieo', -# Minor Japanese search engines -'ask\.jp','askjp', -'sagool\.jp','sagool', -'websearch\.rakuten\.co\.jp', 'rakuten', -# Minor Norwegian search engines -'sok\.start\.no','start', 'eniro\.no','eniro', -# Minor Polish search engines -'szukaj\.wp\.pl','wp', -'szukaj\.onet\.pl','onetpl', -'dodaj\.pl','dodajpl', -'gazeta\.pl','gazetapl', -'gery\.pl','gerypl', -'netsprint\.pl\/hoga\-search','hogapl', -'netsprint\.pl','netsprintpl', 
-'interia\.pl','interiapl', -'katalog\.onet\.pl','katalogonetpl', -'o2\.pl','o2pl', -'polska\.pl','polskapl', -'szukacz\.pl','szukaczpl', -'wow\.pl','wowpl', -# Minor russian search engines -'ya(ndex)?\.ru','yandex', -'aport\.ru','aport', -'rambler\.ru','rambler', -'turtle\.ru','turtle', -'metabot\.ru','metabot', -'go\.mail\.ru', 'mailru', -# Minor Swedish search engines -'evreka\.passagen\.se','passagen', -'eniro\.se','enirose', -# Minor Slovak search engines -'zoznam\.sk','zoznam', -# Minor Portuguese search engines -'sapo\.pt','sapo', -# Minor swiss search engines -'search\.ch','searchch', -'search\.bluewin\.ch','bluewin', -'www\.zapmeta\.ch', 'zapmeta_ch', -'etools\.ch', 'etools_ch', -# Minor Croatian, Serbian, Macedonian, Bosnian and Herzegovinian search engines -'pogodak\.','pogodak', -# Generic search engines -'search\..*\.\w+','search' -); - - -# SearchEnginesWithKeysNotInQuery -# List of search engines that store keyword as page instead of query parameter -#------------------------------------------------------------------------------ -%SearchEnginesWithKeysNotInQuery=( -'a9',1, # www.a9.com/searchkey1%20searchkey2 -'iminent',1, #http://start.iminent.com/StartWeb/1031/toolbox/#q=searchkey1%20searchkey2&additional_arguments -'de_wiki_gov_cn',1, #http://de.wiki.gov.cn/s_searchkey1%20searchkey2 -'umuwa_de', 1, #http://umuwa.de/searchkey or http://umuwa.de/searchkey/Images -'amazonsearch', 1 #http://www.amazon.de/gp/bit/apps/web/SERP/search/ref=bit_bds-p24_serp_cr_de?ie=UTF8tagbase=bds-p24&query=deutsch+8.+klasse+gymnasium+protokoll -); - -# SearchEnginesKnownUrl -# Known rules to extract keywords from a referrer search engine URL -#------------------------------------------------------------------------------ -%SearchEnginesKnownUrl=( -# Most common search engines -'alexa','q=', -'alltheweb','q(|uery)=', -'altavista','q=', -'a9','a9\.com\/', -'dmoz','search=', -'google_products','(p|q|as_p|as_q)=', -'google_base','(p|q|as_p|as_q)=', 
-'google_froogle','(p|q|as_p|as_q)=', -'google_groups','group\/', # does not work -'google_image','(p|q|as_p|as_q)=', -'google_cache','(p|q|as_p|as_q)=cache:[0-9A-Za-z]{12}:', -'google','(p|q|as_p|as_q)=', -'lycos','query=', -'msn','q=', -'live','q=', -'bing','q=', -'netscape','search=', -'tiscali','key=', -'aol','query=', -'terra','query=', -'voila','(kw|rdata)=', -'search.com','q=', -'yahoo_mindset','p=', -'yahoo','p=', -'sympatico', 'query=', -'excite','search=', -# Minor international search engines -'google4counter','(p|q|as_p|as_q)=', -'att','qry=', -'bungeebonesdotcom','query=', -'go','qt=', -'askde','(ask|q)=', # break out Ask country specific engines. -'askes','(ask|q)=', -'askfr','(ask|q)=', -'askit','(ask|q)=', -'asknl','(ask|q)=', -'ask','(ask|q)=', -'atomz','sp-q=', -'euroseek','query=', -'findarticles','key=', -'go2net','general=', -'hotbot','mt=', -'infospace','qkw=', -'kvasir', 'q=', -'looksmart','key=', -'mamma','query=', -'metacrawler','general=', -'nbci','keyword=', -'northernlight','qr=', -'overture','keywords=', -'dogpile', 'q(|kw)=', -'spray','string=', -'teoma','q=', -'webcrawler','searchText=', -'wisenut','query=', -'ixquick', 'query=', -'earthlink', 'q=', -'iune','(keywords|q)=', -'blingo','q=', -'centraldatabase','query=', -'clusty','query=', -'mysearch','searchfor=', -'vivisimo','query=', -# kartoo: No keywords passed in referring URL. 
-'kartoo','', -'icerocket','q=', -'sphere','q=', -'ledix','q=', -'shawca','q=', -'searchalot','q=', -'copernic','web\/', -'avantfind','keywords=', -'steadysearch','w=', -'clarosearch','q=', -'searchresults','q=', -'holasearch', 'q=', -'conduit', 'q=', -'flipora', 'q=', -'delta-search', 'q=', -'iminent', 'q=', -'searchmobileonline', 'q=', -'nortonsavesearch', 'q=', -'inbox', 'q(?:kw)?=', -'govome', 'q=', -'find1friend', 'q=', -'mysearchdial', 'q=', -'speedbit', 'q=', -'certifiedtoolbarsearch', 'q=', -'sweetim', 'q=', -'searchcompletion', 'q=', -'eazelsearch', 'q=', -'searchfunmoods', 'q=', -'googleByIP', 'q=', -'dalesearch', 'q=', -'sweetpacks', 'q=', -'searchgol', 'q=', -'duckduckgo', 'uddg=', -'facemoods', 'q=', -'shoppstop', 'keywords=', -'searchya', 'q=', -'picsearch', 'q=', -'webssearches', 'q=', -'zapmeta', 'query=', -'localmoxie', 'keyword=', -'search-results_mobi', 'q=', -'androidsearch', 'q=', -'isearch_nation_com', 'q=', -'search_zonealarm_com', 'q=', -'www_buenosearch_com', 'q=', -'search_foxtab_com', 'q=', -'searches_qone8_com', 'q=', -'startpage_com', 'query=', -'qwant_com', 'q=', -'safehomepage_com', 'q=', -'vi-view_com', 'q=', -'wow_utop_it', 'q=', -'windowssearch_com', 'q=', -'www_wow_com', 'q=', -'globososo', 'q=', -'swisscows_ch', 'query=', -'preciobarato_xyz', 's=', -'www_dregol_com', 'q=', -'search_socialdownloadr_com', 'q=', -'int_search_myway_com', 'searchfor=', -'de_dolphin_com', 'q=', -'mys_yoursearch_me', 'q=', -# Chello Portals -'chelloat','q1=', -'chellobe','q1=', -'chellocz','q1=', -'chellofr','q1=', -'chellohu','q1=', -'chellonl','q1=', -'chellono','q1=', -'chellopl','q1=', -'chellose','q1=', -'chellosk','q1=', -'chellocom','q1=', -# Mirago -'miragobe','(txtsearch|qry)=', -'miragoch','(txtsearch|qry)=', -'miragode','(txtsearch|qry)=', -'miragodk','(txtsearch|qry)=', -'miragoes','(txtsearch|qry)=', -'miragofr','(txtsearch|qry)=', -'miragoit','(txtsearch|qry)=', -'miragonl','(txtsearch|qry)=', -'miragono','(txtsearch|qry)=', 
-'miragose','(txtsearch|qry)=', -'miragocouk','(txtsearch|qry)=', -'mirago','(txtsearch|qry)=', -'answerbus','', # Does not provide query parameters -'icq','q=', -'nusearch','nusearch_terms=', -'goodsearch','Keywords=', -'scroogle','Gw=', # Does not always provide query parameters -'questionanswering','', -'mywebsearch','searchfor=', -'comettoolbar','qry=', -# Social Bookmarking Services -'delicious','all=', -'digg','s=', -'stumbleupon','', -'swik','swik\.net/', # does not work. Keywords follow domain, e.g. http://swik.net/awstats+analytics -'segnalo','', -'ineffabile','', -# Minor Australian search engines -'anzwers','search=', -# Minor brazilian search engines -'engine','p1=', 'miner','q=', -# Minor chinese search engines -'baidu','(wd|word)=', -'iask','(w|k)=', -'accoona','qt=', -'3721','(p|name)=', -'netease','q=', -'soso','q=', -'zhongsou','(word|w)=', -'sogou', 'query=', -'vnet','kw=', -# Minor czech search engines -'atlas','(searchtext|q)=', 'seznam','(w|q)=', 'quick','query=', 'centrum','q=', 'jyxo','(s|q)=', 'najdi','dotaz=', 'redbox','srch=', -'avgsearch', 'q=', -# Minor danish search engines -'opasia','q=', 'danielsen','q=', 'sol','q=', 'jubii','soegeord=', 'finddk','words=', 'edderkoppen','query=', 'orbis','search_field=', '1klik','query=', 'ofir','querytext=', -# Minor dutch search engines -'ilse','search_for=', 'vindex','in=', -# Minor english search engines -'askuk','(ask|q)=', 'bbc','q=', 'freeserve','q=', 'looksmartuk','key=', -'splut','pattern=', 'spotjockey','Search_Keyword=', 'ukindex', 'stext=', 'ukdirectory','k=', 'ukplus','search=', 'searchy', 'search_term=', -'fbdownloader','q=', -'fdownloadr_com', 'q=', -'babylon','q=', -'allgameshome', 's=', -'surfcanyon_com', 'q=', -'uk_foxstart_com', 'q=', -'yandex_com', 'text=', -# Minor finnish search engines -'haku','w=', -# Minor french search engines -'francite','name=', 'clubinternet', 'q=', -'toile', 'q=', -'biglotron','question=', -'mozbot','q=', -# Minor german search engines -'aolde','q=', 
-'o2aolde', 'q=', -'fireball','q=', 'infoseek','qt=', 'webde','su=', -'abacho','q=', 't-online','q=', -'metaspinner','qry=', -'metacrawler_de','qry=', -'wwweasel','q=', -'netluchs','query=', -'schoenerbrausen','q=', -'gmxsuche', 'q=', -'gmxsuche_at', 'q=', -'ecosiasearch', 'q=', -'aolsearch', 'q=', -'aolsuche', 'q=', -'startxxl', 'q=', -'benefind', 'q=', -'amazonsearch', 'query=', -'wowsearch', 'q=', -'vlips_de', 'q=', -'metager', 'eingabe=', -'search_1und1_de', 'q=', -'smde', 'q=', -#'sumaja', 'no query string available', #There is no query string in the referrer url -'navigationshilfe', 'q=', -'umfis', 'suchbegriff=', -'fastbot_de', 'red=[0-9]*\+', -'tixuma_de', 'sc=', -'freenet_de', 'query=', -'izito_de', 'q=', -'peoplecheck_de', 'q=', -'oneseek_de', 'q=', -'de_wiki_gov_cn', 'de\.wiki\.gov\.cn\/s_', -'umuwa_de', 'umuwa\.de\/', -'1und1_de', 'q=', -'metasuche_ch', 'q=', -# Minor Hungarian search engines -'heureka','heureka=', 'origo','(q|search)=', 'goliat','KERESES=', 'wahoo','q=', 'internetto','searchstr=', -'keresolap_hu','q=', -'startlap_hu', 'q=', -'tango_hu','q=', -'polymeta_hu','', -# Minor Indian search engines -'sify','keyword=', -# Minor Italian search engines -'virgilio','qs=', -'arianna','query=', -'supereva','q=', -'kataweb','q=', -'aliceitmaster','qs=', -'aliceit','qs=', -'gotuneed','', # Not yet known -'godado','Keywords=', -'jumpy\.it','searchWord=', -'shinyseek\.it','KEY=', -'teecnoit','q=', -# Minor Israeli search engines -'genieo','q=', -# Minor Japanese search engines -'askjp','(ask|q)=', -'sagool','q=', -'rakuten', 'qt=', -# Minor Norwegian search engines -'start','q=', 'eniro','q=', -# Minor Polish search engines -'wp','szukaj=', -'onetpl','qt=', -'dodajpl','keyword=', -'gazetapl','slowo=', -'gerypl','q=', -'hogapl','qt=', -'netsprintpl','q=', -'interiapl','q=', -'katalogonetpl','qt=', -'o2pl','qt=', -'polskapl','qt=', -'szukaczpl','q=', -'wowpl','q=', -# Minor russian search engines -'yandex', 'text=', 'rambler','words=', 'aport', 'r=', 
'metabot', 'st=', -'mailru', 'q=', -# Minor swedish search engines -'passagen','q=', -'enirose', 'hitta:', #Not sure if this works, as the keywords are part of the URL, and therefore the URL does not contain a question mark. -# Minor swiss search engines -'searchch', 'q=', 'bluewin', 'qry=', -'zapmeta_ch', 'query=', -'etools_ch', 'query=', -# Minor Croatian, Serbian, Macedonian, Bosnian and Herzegovinian search engines -'pogodak', 'q=' -); - -# SearchEnginesKnownUrlNotFound -# Known rules to extract not found keywords from a referrer search engine URL -#------------------------------------------------------------------------------ -%SearchEnginesKnownUrlNotFound=( -# Most common search engines -'msn','origq=' -); - -# If no rules are known, we take first paramater not into WordsToCleanSearchUrl -#------------------------------------------------------------------------------ -@WordsToCleanSearchUrl= ('act=','annuaire=','btng=','cat=','categoria=','cfg=','cof=','cou=','count=','cp=','dd=','domain=','dt=','dw=','enc=','exec=','geo=','hc=','height=','hits=','hl=','hq=','hs=','id=','kl=','lang=','loc=','lr=','matchmode=','medor=','message=','meta=','mode=','order=','page=','par=','pays=','pg=','pos=','prg=','qc=','refer=','sa=','safe=','sc=','sort=','src=','start=','style=','stype=','sum=','tag=','temp=','theme=','type=','url=','user=','width=','what=','\\.x=','\\.y=','y=','look='); - -# SearchEnginesKnownUTFCoding -# Known parameter that proves a search engine has coded its parameters in UTF-8 -#------------------------------------------------------------------------------ -%SearchEnginesKnownUTFCoding=( -# Most common search engines -'google','ie=utf-8', -'alltheweb','cs=utf-8' -); - - -# SearchEnginesHashLib -# List of search engines names -# 'search_engine_id', 'search_engine_name', -#------------------------------------------------------------------------------ -%SearchEnginesHashLib=( -# Major international search engines -'alexa','Alexa', 
-'alltheweb','AllTheWeb', -'altavista','AltaVista', -'a9', 'A9', -'dmoz','DMOZ', -'google_products','Google (Products)', -'google_base','Google (Base)', -'google_froogle','Froogle (Google)', -'google_groups','Google (Groups)', -'google_image','Google (Images)', -'google_cache','Google (cache)', -'google','Google', -'lycos','Lycos', -'msn','Microsoft MSN Search', -'live','Microsoft Windows Live', -'bing','Microsoft Bing', -'netscape','Netscape', -'aol','AOL', -'terra','Terra', -'tiscali','Tiscali', -'voila','Voila', -'search.com','Search.com', -'yahoo_mindset','Yahoo! Mindset', -'yahoo','Yahoo!', -'sympatico','Sympatico', -'excite','Excite', -# Minor international search engines -'google4counter','4-counter (Google)', -'att','AT&T search (powered by Google)', -'bungeebonesdotcom','BungeeBones', -'go','Go.com', -'askde','Ask Deutschland', -'askes','Ask España', # break out Ask country specific engines. -'askfr','Ask France', -'askit','Ask Italia', -'asknl','Ask Nederland', -'ask','Ask', -'atomz','Atomz', -'dejanews','DejaNews', -'euroseek','Euroseek', -'findarticles','Find Articles', -'go2net','Go2Net (Metamoteur)', -'hotbot','Hotbot', -'infospace','InfoSpace', -'kvasir','Kvasir', -'looksmart','Looksmart', -'mamma','Mamma', -'metacrawler','MetaCrawler (Metamoteur)', -'nbci','NBCI', -'northernlight','NorthernLight', -'overture','Overture', # Replace 'goto\.com','Goto.com', -'dogpile','Dogpile', -'spray','Spray', -'teoma','Teoma', # Replace 'directhit\.com','DirectHit', -'webcrawler','WebCrawler', -'wisenut','WISENut', -'ixquick','ix quick', -'earthlink', 'Earth Link', -'iune','i-une', -'blingo','Blingo', -'centraldatabase','GPU p2p search', -'clusty','Clusty', -'mysearch','My Search', -'vivisimo','Vivisimo', -'kartoo','Kartoo', -'icerocket','Icerocket (Blog)', -'sphere','Sphere (Blog)', -'ledix','Ledix', -'shawca','Shaw.ca', -'searchalot','Searchalot', -'copernic','Copernic', -'avantfind','Avantfind', -'steadysearch','Avantfind', -'clarosearch','Claro Search', 
-'searchresults','Search-results', -'holasearch', 'Hola Search', -'conduit', 'Conduit Search', -'flipora', 'Flipora', -'delta-search', 'Delta Search', -'iminent', 'Iminent', -'searchmobileonline', 'Search Mobile Online (StartApp)', -'nortonsavesearch', 'Norton Safe Search', -'inbox', 'Inbox Search', -'govome', 'Govome', -'find1friend', 'Find1Friend', -'mysearchdial', 'My Search Dial', -'speedbit', 'Speedbit', -'certifiedtoolbarsearch', 'Certified-Toolbar Search', -'sweetim', 'SweetIM Search', -'searchcompletion', 'SearchCompletion Search', -'eazelsearch', 'Eazel Search', -'searchfunmoods', 'Funmoods', -'googleByIP', 'Google (Access by IP-Address)', -'dalesearch', 'Dale Search', -'sweetpacks', 'Sweetpacks', -'searchgol', 'Search-Gol', -'duckduckgo', 'DuckDuckGo (Does not provide search keyphrases, using found page instead)', -'facemoods', 'Facemoods Search', -'shoppstop', 'ShoppStop', -'searchya', 'Searchya', -'picsearch', 'picsearch', -'webssearches', 'Various variants of Webssearches EMG Technologies and airzip.inspsearch.com', -#Jan 8, 2016: No genuine inspsearch.com search engine seems so exist, but there is a couple of search engines using subdomains of inspsearch.com. Unclear how these are related to each other. 
-'zapmeta', 'ZapMeta', -'localmoxie', 'Local Moxie', -'search-results_mobi', 'search-results.mobi', -'androidsearch', 'androidsearch.com', -'isearch_nation_com', 'Nation Search', -'search_zonealarm_com', 'Zone Alarm Search', -'www_buenosearch_com', 'BuenoSearch', -'search_foxtab_com', 'Foxtab Search', -'searches_qone8_com', 'Omiga-Plus', -'startpage_com', 'Startpage', -'qwant_com', 'qwant.com', -'safehomepage_com', 'safehomepage.com', -'vi-view_com', 'vi-view.com', -'wow_utop_it', 'wow.utop.it', -'windowssearch_com', 'windowssearch.com', -'www_wow_com', 'WOW.com', -'globososo', 'Various variants of Globososo (Kingtale Technology): www, searches, searches3, and at inspsearch.com (globososo, kingtale3)', -'swisscows_ch', 'Swisscows', -'preciobarato_xyz', 'Yandex', -'www_dregol_com', 'Dregol Search', -'search_socialdownloadr_com', 'Socialdownloadr', -'int_search_myway_com', 'MyWay', -'de_dolphin_com', 'Dolphin Search', -'mys_yoursearch_me', 'Yoursearch.me', -# Chello Portals -'chelloat','Chello Austria', -'chellobe','Chello Belgium', -'chellocz','Chello Czech Republic', -'chellofr','Chello France', -'chellohu','Chello Hungary', -'chellonl','Chello Netherlands', -'chellono','Chello Norway', -'chellopl','Chello Poland', -'chellose','Chello Sweden', -'chellosk','Chello Slovakia', -'chellocom','Chello (Country not recognized)', -# Mirago -'miragobe','Mirago Belgium', -'miragoch','Mirago Switzerland', -'miragode','Mirago Germany', -'miragodk','Mirago Denmark', -'miragoes','Mirago Spain', -'miragofr','Mirago France', -'miragoit','Mirago Italy', -'miragonl','Mirago Netherlands', -'miragono','Mirago Norway', -'miragose','Mirago Sweden', -'miragocouk','Mirago UK', -'mirago','Mirago (country unknown)', -'answerbus','Answerbus', -'icq','icq', -'nusearch','Nusearch', -'goodsearch','GoodSearch', -'scroogle','Scroogle', -'questionanswering','Questionanswering', -'mywebsearch','MyWebSearch', -'comettoolbar','Comet toolbar search', -# Social Bookmarking Services 
-'delicious','del.icio.us (Social Bookmark)', -'digg','Digg (Social Bookmark)', -'stumbleupon','Stumbleupon (Social Bookmark)', -'swik','Swik (Social Bookmark)', -'segnalo','Segnalo (Social Bookmark)', -'ineffabile','Ineffabile.it (Social Bookmark)', -# Minor Australian search engines -'anzwers','anzwers.com.au', -# Minor brazilian search engines -'engine','Cade', 'miner','Meta Miner', -# Minor chinese search engines -'baidu','Baidu', -'iask','Iask', -'accoona','Accoona', -'3721','3721', -'netease', 'NetEase', -'soso','SoSo', -'zhongsou','ZhongSou', -'sogou', 'SoGou', -'vnet','VNet', -# Minor czech search engines -'atlas','Atlas.cz', 'seznam','Seznam', 'quick','Quick.cz', 'centrum','Centrum.cz', 'jyxo','Jyxo.cz', 'najdi','Najdi.to', 'redbox','RedBox.cz', -'avgsearch', 'AVG Secure Search', -# Minor danish search-engines -'opasia','Opasia', 'danielsen','Thor (danielsen.com)', 'sol','SOL', 'jubii','Jubii', 'finddk','Find', 'edderkoppen','Edderkoppen', 'netstjernen','Netstjernen', 'orbis','Orbis', 'tyfon','Tyfon', '1klik','1Klik', 'ofir','Ofir', -# Minor dutch search engines -'ilse','Ilse','vindex','Vindex\.nl', -# Minor english search engines -'askuk','Ask UK', -'bbc','BBC', 'freeserve','Freeserve', 'looksmartuk','Looksmart UK', -'splut','Splut', 'spotjockey','Spotjockey', 'ukdirectory','UK Directory', 'ukindex','UKIndex', 'ukplus','UK Plus', 'searchy','searchy.co.uk', -'fbdownloader','FBDownloader (fbdownloader)', -'fdownloadr_com', 'FBDownloader (fdownloadr)', -'babylon','Babylon', -'allgameshome', 'AllGamesHome', -'surfcanyon_com', 'SurfCanyon', -'uk_foxstart_com', 'Foxstart.com', -'yandex_com', 'Yandex', -# Minor finnish search engines -'haku','Ihmemaa', -# Minor french search engines -'aolfr','AOL (fr)', 'ctrouve','C\'est trouve', 'francite','Francite', 'lbb', 'LBB', 'libertysurf', 'Libertysurf', 'free', 'Free.fr', 'clubinternet', 'Club-internet', -'toile', 'Toile du Quebec', -'biglotron','Biglotron', -'mozbot','Mozbot', -# Minor German search engines 
-'aolde','AOL (de)', -'o2aolde', 'o2 Suche', -'fireball','Fireball', 'infoseek','Infoseek', -'webde','Web.de', -'abacho','Abacho', -'t-online','T-Online', -'allesklar','allesklar.de', 'meinestadt','meinestadt.de', -'metaspinner','metaspinner', -'metacrawler_de','metacrawler.de', -'wwweasel','WWWeasel', -'netluchs','Netluchs', -'schoenerbrausen','Schoenerbrausen/', -'gmxsuche', 'GMX Suche', -'gmxsuche_at', 'GMX Suche Oesterreich', -'ecosiasearch', 'Ecosia Search', -'aolsearch', 'AOL Search', -'aolsuche', 'AOL Suche', -'startxxl', 'StartXXL', -'benefind', 'benefind', -'amazonsearch', 'Amazon Web Search', -'wowsearch', 'Wow Search', -'vlips_de', 'vlips.de', -'metager', 'MetaGer', -'search_1und1_de', '1&1 Suche (subdomain "search")', -'smde', 'SM.de - Die SuchMaschine', -'sumaja', 'Sumaja', -'navigationshilfe', 'T-Online Navigationshilfe', -'umfis', 'UMFIS-Online Das Umweltfirmen-Informationssystem der IHKs in Deutschland', -'fastbot_de', 'Fastbot.de (Does not provide search keyphrases, using found page instead)', -'tixuma_de', 'Tixuma Deutschland', -'freenet_de', 'suche.freenet.de', -'izito_de', 'iZito Deutschland', -'peoplecheck_de', 'PeopleCheck.de', -'oneseek_de', 'Metasuchmaschine OneSeek.de', -'de_wiki_gov_cn', 'Wiki Sucher', -'umuwa_de', 'Umuwa Deutschland', -'1und1_de', '1&1 Suche (subdomain "suche")', -'metasuche_ch', 'Metasuche.ch', -# Minor hungarian search engines -'heureka','Heureka', 'origo','Origo-Vizsla', 'lapkereso','Startlapkereso', 'goliat','Goliat', 'indexhu','Index', 'wahoo','Wahoo', 'webmania','webmania.hu', 'internetto','Internetto Kereso', -'tango_hu','Tango', -'keresolap_hu','Tango keresolap', -'startlap_hu','Startlab Kereso', -'polymeta_hu','Polymeta', -# Minor Indian search engines -'sify','Sify', -# Minor Italian search engines -'virgilio','Virgilio', -'arianna','Arianna', -'supereva','Supereva', -'kataweb','Kataweb', -'aliceitmaster','search.alice.it.master', -'aliceit','alice.it', -'gotuneed','got u need', -'godado','Godado.it', 
-'jumpy\.it','Jumpy.it', -'shinyseek\.it','Shinyseek.it', -'teecnoit','Teecno', -# Minor Israeli search engines -'genieo','Genieo', -# Minor Japanese search engines -'askjp','Ask Japan', -'sagool','Sagool', -'rakuten', 'websearch.rakuten.co.jp', -# Minor Norwegian search engines -'start','start.no', 'eniro','Eniro', -# Minor polish search engines -'wp','Wirtualna Polska', -'onetpl','Onet.pl', -'dodajpl','Dodaj.pl', -'gazetapl','Gazeta.pl', -'gerypl','Gery.pl', -'hogapl','Hoga.pl', -'netsprintpl','NetSprint.pl', -'interiapl','Interia.pl', -'katalogonetpl','Katalog.Onet.pl', -'o2pl','o2.pl', -'polskapl','Polska', -'szukaczpl','Szukacz', -'wowpl','Wow.pl', -# Minor russian search engines -'yandex', 'Yandex', 'aport', 'Aport', 'rambler', 'Rambler', 'turtle', 'Turtle', 'metabot', 'MetaBot', -'mailru','Mail.Ru', -# Minor Swedish search engines -'passagen','Evreka', -'enirose','Eniro Sverige', -# Minor Slovak search engines -'zoznam','Zoznam', -# Minor Portuguese search engines -'sapo','Sapo', -# Minor Swiss search engines -'searchch', 'search.ch', 'bluewin', 'search.bluewin.ch', -'zapmeta_ch', 'ZapMeta.ch', -'etools_ch', 'eTools.ch', -# Minor Croatian, Serbian, Macedonian, Bosnian and Herzegovinian search engines -'pogodak','Pogodak.com', -# Generic search engines -'search','Unknown search engines' -); - - -# Sanity check. -# Enable this code and run perl search_engines.pm to check file entries are ok -#----------------------------------------------------------------------------- -#foreach my $key (@SearchEnginesSearchIDOrder_list1) { -# if (! 
$SearchEnginesHashID{$key}) { error("Entry '$key' has been found in SearchEnginesSearchIDOrder_list1 with no value in SearchEnginesHashID"); -# foreach my $key2 (@SearchEnginesSearchIDOrder_list2) { if ($key2 eq $key) { error("$key is in 1 and 2\n"); } } -# foreach my $key2 (@SearchEnginesSearchIDOrder_listgen) { if ($key2 eq $key) { error("$key is in 1 and gen\n"); } } -#} } -#foreach my $key (@SearchEnginesSearchIDOrder_list2) { -# if (! $SearchEnginesHashID{$key}) { error("Entry '$key' has been found in SearchEnginesSearchIDOrder_list1 with no value in SearchEnginesHashID"); -# foreach my $key2 (@SearchEnginesSearchIDOrder_list1) { if ($key2 eq $key) { error("$key is in 2 and 1\n"); } } -# foreach my $key2 (@SearchEnginesSearchIDOrder_listgen) { if ($key2 eq $key) { error("$key is in 2 and gen\n"); } } -#} } -#foreach my $key (@SearchEnginesSearchIDOrder_listgen) { if (! $SearchEnginesHashID{$key}) { error("Entry '$key' has been found in SearchEnginesSearchIDOrder_listgen with no value in SearchEnginesHashID"); } } -#foreach my $key (keys %NotSearchEnginesKeys) { if (! $SearchEnginesHashID{$key}) { error("Entry '$key' has been found in NotSearchEnginesKeys with no value in SearchEnginesHashID"); } } -#foreach my $key (keys %SearchEnginesKnownUrl) { -# my $found=0; -# foreach my $key2 (values %SearchEnginesHashID) { -# if ($key eq $key2) { $found=1; last; } -# } -# if (! $found) { die "Entry '$key' has been found in SearchEnginesKnownUrl with no value in SearchEnginesHashID"; } -#} -#foreach my $key (keys %SearchEnginesHashLib) { -# my $found=0; -# foreach my $key2 (values %SearchEnginesHashID) { -# if ($key eq $key2) { $found=1; last; } -# } -# if (! $found) { die "Entry '$key' has been found in SearchEnginesHashLib with no value in SearchEnginesHashID"; } -#} -#print @SearchEnginesSearchIDOrder_list1." ".@SearchEnginesSearchIDOrder_list2." 
".@SearchEnginesSearchIDOrder_listgen; - -1; +# AWSTATS SEARCH ENGINES DATABASE +#------------------------------------------------------------------------------ +# If you want to add a Search Engine to extend AWStats database detection capabilities, +# you must add an entry in SearchEnginesSearchIDOrder, SearchEnginesHashID and in +# SearchEnginesHashLib. +# An entry in SearchEnginesKnownUrl, if known, is also welcome. +# +# to eldy: Please check if the following description is correct: +# You need the following information to specify a search engine: +# (a) A regular expression that matches the referrer string of the +# search engine. Unclear: What about slashes in the name of +# a search engine, e.g. as in 'ecosia.com/search'. Seems that +# AWStats will not find search strings containing a slash. +# Maybe use a search string without a slash, and - if necessary - +# an entry in %NotSearchEnginesKeys, if this search string +# matches entries that are not search engines. +# Example of a web address of an Amazon search engine: +# http://www.amazon.de/gp/bit/apps/web/SERP/search/ref=bit_bds-p24_serp_cr_de?ie=UTF8tagbase=bds-p24&query=deutsch+8.+klasse+gymnasium+protokoll +# (b) A unique string to identify the search engine within AWStats +# (c) A regular expression that finds the start of the query part in the +# referrer string +# (d) An HTML-fragment that goes into the reports generated by AWStats which +# identifies the search engine to the human reader of the report. In the +# simplest case this is a string containing the name of the search +# engine. You can also provide a hypertext clause that presents the +# name together with a link to the search engine. +# +# The regular expression (a) goes into SearchEnginesSearchIDOrder_list1 +# or ..._list2. List 1 contains common search engines, list 2 those +# that are not so often used.
+# +# SearchEnginesHashID contains two consecutive entries for each search +# engine: The regular expression (a) followed by the search engine +# identifier (b) +# +# SearchEnginesKnownUrl specifies how to find the start of the query. +# For each search engine you enter the search engine identifier (b) +# followed by the regular expression (c). Unclear: It is possible to +# omit this entry. If you do this, how will AWStats find the start of +# the query? +# +# SearchEnginesHashLib also contains two entries for each search engine: +# The search engine identifier (b) followed by the HTML-Fragment (d) +# +# There are search engines that do not use a query part in their URLs. +# They put the search expression in the main part of the URL instead. +# AWStats is able to handle these cases. They are specified as described +# above, except the following two things: +# - The regular expression (c) searches the complete URL and not only +# the query part. +# - An additional entry in the list %SearchEnginesWithKeysNotInQuery is +# necessary. +# +# +# AWStats runs a sanity check of the contents of search_engines.pm. This +# check detects the following things: +# - Inconsistencies (number of entries) +# It does not detect the following errors: +# - If the HTML-Fragment (d) is syntactically incorrect. 
+# +#------------------------------------------------------------------------------ + +# 2005-08-19 Sean Carlos http://www.antezeta.com/awstats.html +# added minor italian search engines +# arianna http://arianna.libero.it/ +# supereva http://search.supereva.com/ +# kataweb http://kataweb.it/ +# corrected uk looksmart +# 'askuk','ask=', 'bbc','q=', 'freeserve','q=', 'looksmart','key=', +# to +# 'askuk','ask=', 'bbc','q=', 'freeserve','q=', 'looksmartuk','key=', +# corrected spelling +# internationnal -> international +# added 'google\.'=>'mail\.google\.', to NotSearchEnginesKeys in order to +# avoid counting gmail referrals as search engine traffic +# 2005-08-21 Sean Carlos http://www.antezeta.com/awstats.html +# avoid counting babelfish.altavista referrals as search engine traffic +# avoid counting translate.google referrals as search engine traffic +# 2005-11-20 Sean Carlos +# added missing 'tiscali','key=', entry. Check order +# 2005-11-22 Sean Carlos +# added Google Base & Froogle. Froogle not tested. 
+# 2006-04-18 Sean Carlos http://www.antezeta.com/awstats.html +# added biglotron.com (France) +# added blingo http://www.blingo.com/ +# added Clusty & Vivisimo +# added eniro.no (Norway) [https://sourceforge.net/forum/message.php?msg_id=3134783] +# added GPU p2p search http://search.centraldatabase.org/ +# added mail.tiscali to "not search engines list" [https://sourceforge.net/forum/message.php?msg_id=3166688] +# added Ask group's "mysearch" +# added sify.com (India) +# added sogou.com (China) [https://sourceforge.net/forum/message.php?msg_id=3501603] +# Ask changes: +# - added Ask Japan (ask.jp) +# - break out Ask new country level variants (DE, ES, FR, IT, NL) +# - updated Ask name from Ask Jeeves +# - added Ask q= parameter - many recent searches probably not recognized; [https://sourceforge.net/forum/message.php?msg_id=3465444] +# - updated Ask uk (new uk.ask.com added to older ask.co.uk) +# updated voila kw|rdata parameter [https://sourceforge.net/forum/message.php?msg_id=3373912] +# for each new engine, added link to Search Engine. This serves to document engine. Done for major & Italian engines as well. Requires patch +# to AWStats to allow untranslated html. Otherwise html will appear instead of link. +# reviewed mnoGoSearch (http://www.mnogosearch.org/); the search engine mentioned no longer +# exists https://sourceforge.net/forum/message.php?msg_id=3025426 +# 2006-05-13 Sean Carlos http://www.antezeta.com/awstats.html +# added 10 Chello European broadband portals (Austria, Belgium, Czech Republic, France, Hungary, The Netherlands, Norway, Poland, Slovakia, Sweden) +# added Alice Internal Search (blends data with Google?) search.alice.it.master:10005 +# added detection of google cache views from IPs 66.249.93.104 72.14.203.104 72.14.207.104 +# To do: add more extensive IP list; keywords not yet detected. +# added icerocket.com blog search http://www.icerocket.com/ +# added live.com (msn) http://www.live.com/ +# added Meta motor kartoo. 
Note: Kartoo does not provide search words in referrers, thus the engine will appear in the +# search engine list but the actual search words are not available. +# added netluchs.de http://www.netluchs.de/ +# added sphere.com blog search http://www.sphere.com/ +# added wwweasel.de http://wwweasel.de +# added Yahoo Mindset! http://mindset.research.yahoo.com/ +# updated Mirago query parameter recognition (qry=); added breakout for each country (France, Germany, Spain, Italy, Norway, Sweden, Denmark, Netherlands, Belgium, Switzerland) +# 2006-05-13 Sean Carlos http://www.antezeta.com/awstats.html +# added Google cache IPs 64.233.183.104 & 66.102.7.104 +# 2006-05-20 Sean Carlos http://www.antezeta.com/awstats.html +# anzwers.com.au +# schoenerbrausen.de http://www.schoenerbrausen.de/ +# added Google cache IP 216.239.59.104 +# answerbus http://www.answerbus.com/ (does not provide keywords) +# 2006-05-23 Sean Carlos http://www.antezeta.com/awstats.html +# added Google cache IP 66.102.9.104, 64.233.161.104 +# 2006-06-23 Sean Carlos http://www.antezeta.com/awstats.html +# added Alice Search search.alice.it +# added GoodSearch http://www.goodsearch.com/ (does not provide keywords) "a Yahoo-powered search engine that donates money to your favorite charity or school each time you search the web" +# added googlee.com, variant of Google +# added gotuneed http://www.gotuneed.com/ Italian search engine, in beta +# added icq.com +# added logic to parse Google Cache search keywords. Seems to work for alpha but not numeric cache IDs, i.e. search?q=cache:lWVLmnuGJswJ: is recognized but q=cache:Yv5qxeJNuhgJ: is not recognized. The URL triggering the keywords will also appear. The URLs are probably too varied to parse out? 
+# added Nusearch http://www.nusearch.com/ +# added Polymeta www.polymeta.hu (does not provide keywords) +# added scroogle http://www.scroogle.org/ (does not always provide keywords) +# added Tango http://tango.hu/search.php?st=0&q=jeles+napok +# Changed Google Cache notation 64\.233\.(161|167|179|183|187)\.104 to 64\.233\.1[0-9]{2}\.104 +# 72\.14\.(203|205|207|209|221)\.104 to 72\.14\.2[0-9]{2}\.104 +# 216\.239\.(51|59)\.104 to 216\.239\.5[0-9]\.104 +# 66\.102\.(7|9)\.104 to 66\.102\.[1-9]\.104 +# 2006-06-27 Sean Carlos http://www.antezeta.com/awstats.html +# added Onet.pl http://szukaj.onet.pl/ +# corrected name "Wirtualna Polska" from "Szukaj" (search); added link http://szukaj.wp.pl/ +# 2006-06-30 Sean Carlos http://www.antezeta.com/awstats.html +# Additional Polish Search Engines: +# added Dodaj.pl http://www.dodaj.pl/ +# added Gazeta.pl http://szukaj.gazeta.pl/ +# added Gery.pl http://szukaj.gery.pl/ +# added Hoga.pl http://www.hoga.pl/ +# added Interia.pl http://www.google.interia.pl/ +# added Katalog.Onet.pl http://katalog.onet.pl/ +# added NetSprint.pl http://www.netsprint.pl/ +# added o2.pl http://szukaj2.o2.pl/ +# added Polska http://szukaj.polska.pl/ +# added Szukacz http://www.szukacz.pl/ +# added Wow.pl http://szukaj.wow.pl/ +# added Sagool http://sagool.jp/ + +# 2006-08-25 Social Bookmarks +# International +# added del.icio.us/search - for now, just search referrer. To do: consider /tag/(tagname) referrer? +# added stumbleupon.com - No keywords supplied. +# added swik.net +# added digg. Keywords sometimes supplied. +# Italy +# added segnalo.alice.it - No keywords supplied. +# added ineffabile.it - No keywords supplied. + +# added filter for google groups. Attempt to parse group name as keyword. 
+ +# 2006-09-14 +# added Eniro Sverige http://www.eniro.se/ +# added MyWebSearch http://search.mywebsearch.com/ +# added Teecno http://www.teecno.it/ Italian Open Source Search Engine + +#package AWSSE; + +# 2006-09-25 (Gabor Moizes) +# added 4-counter (Google alternative) http://4-counter.com/ +# added Googlecom (Google alternative) http://googlecom.com/ +# added Goggle (Google alternative) http://goggle.co.hu/ +# added Comet toolbar http://as.starware.com +# added new IP for Yahoo: 216.109.125.130 +# added Ledix http://ledix.net/ +# added AT&T search (powered by Google) http://www.att.net/ +# added Keresolap (Hungarian search engine) http://www.keresolap.hu/ +# added Mozbot (French search engine) http://www.mozbot.fr/ +# added Zoznam (Slovak search engine) http://www.zoznam.sk/ +# added sapo.pt (Portuguese search engine) http://www.sapo.pt/ +# added shaw.ca (powered by Google) http://start.shaw.ca/ +# added Searchalot http://www.searchalot.com/ +# added Copernic http://www.copernic.com/ +# added 216.109.125.130 to Yahoo +# added 66.218.69.11 to Yahoo +# added Avantfind http://www.avantfind.com/ +# added Steadysearch http://www.steadysearch.com/ +# added Steadysearch http://www.steady-search.com/ +# modified 216\.239\.5[0-9]\.104/search to 216\.239\.5[0-9]\.104 + + +# SearchEnginesSearchIDOrder +# It contains all matching criteria to search for in log fields. This list is +# used to know in which order to search Search Engines IDs. 
+# Most frequent ones are in list1, used when LevelForSearchEnginesDetection is 1 or more +# Minor search engines are in list2, used when LevelForSearchEnginesDetection is 2 or more +# Note: Regex IDs are in lower case and ' ' and '+' are changed into '_' +#------------------------------------------------------------------------------ +@SearchEnginesSearchIDOrder_list1=( +# Major international search engines +'google\.[\w.]+/products', +'base\.google\.', +'froogle\.google\.', +'groups\.google\.', +'images\.google\.', +'google\.', +'googlee\.', +'googlecom\.com', +'goggle\.co\.hu', +'216\.239\.32\.20', +'173\.194\.32\.223', +'216\.239\.(35|37|39|51)\.100', +'216\.239\.(35|37|39|51)\.101', +'216\.239\.5[0-9]\.104', +'64\.233\.1[0-9]{2}\.104', +'66\.102\.[1-9]\.104', +'66\.249\.93\.104', +'72\.14\.2[0-9]{2}\.104', +'msn\.', +'live\.com', +'bing\.', +'voila\.', +'mindset\.research\.yahoo', +'yahoo\.','(66\.218\.71\.225|216\.109\.117\.135|216\.109\.125\.130|66\.218\.69\.11)', +'search\.aol\.co', +'tiscali\.', +'lycos\.', +'alexa\.com', +'alltheweb\.com', +'altavista\.', +'a9\.com', +'dmoz\.org', +'netscape\.', +'search\.terra\.', +'www\.search\.com', +'search\.sli\.sympatico\.ca', +'excite\.' +); + +@SearchEnginesSearchIDOrder_list2=( +# Minor international search engines +'4\-counter\.com', +'att\.net', +'bungeebonesdotcom', +'northernlight\.', +'hotbot\.', +'kvasir\.', +'webcrawler\.', +'metacrawler\.', +'go2net\.com', +'(^|\.)go\.com', +'euroseek\.', +'looksmart\.', +'spray\.', +'nbci\.com\/search', +'de\.ask\.com', # break out Ask country specific engines. 
(.jp is in Japan section) +'es\.ask\.com', +'fr\.ask\.com', +'it\.ask\.com', +'nl\.ask\.com', +'uk\.ask\.com', +'(^|\.)ask\.com', +'atomz\.', +'overture\.com', # Replace 'goto\.com','Goto.com', +'teoma\.', +'findarticles\.com', +'infospace\.com', +'mamma\.', +'dejanews\.', +'dogpile\.com', +'wisenut\.com', +'ixquick\.com', +'search\.earthlink\.net', +'i-une\.com', +'blingo\.com', +'centraldatabase\.org', +'clusty\.com', +'mysearch\.', +'vivisimo\.com', +'kartoo\.com', +'icerocket\.com', +'sphere\.com', +'ledix\.net', +'start\.shaw\.ca', +'searchalot\.com', +'copernic\.com', +'avantfind\.com', +'steadysearch\.com', +'steady-search\.com', +'claro-search\.com', +'www1\.search-results\.com', +'www\.holasearch\.com', +'search\.conduit\.com', +'static\.flipora\.com', +'(?:www[12]?|mixidj)\.delta-search\.com', +'start\.iminent\.com', +'www\.searchmobileonline\.com', +'int\.search-results\.com', +'www2\.inbox\.com', +'www\.govome\.com', +'find1friend\.com', +'start\.mysearchdial\.com', +'go\.speedbit\.com', +'search\.certified-toolbar\.com', +'search\.sweetim\.com', +'search\.searchcompletion\.com', +'en\.eazel\.com', +'sr\.searchfunmoods\.com', +'173\.194\.35\.177', +'dalesearch\.com', +'sweetpacks-search\.com', +'searchgol\.com', +'duckduckgo\.com', +'sr\.facemoods\.com', +'shoppstop\.com', +'searchya\.com', +'picsearch\.de', +'webssearches\.com', +'airzip\.inspsearch\.com', +'zapmeta\.de', +'localmoxie\.com', +'search-results\.mobi', +'androidsearch\.com', +'isearch\.nation\.com', +'search\.zonealarm\.com', +'www\.buenosearch\.com', +'search\.foxtab\.com', +'searches\.qone8\.com', +'startpage\.com', +'www\.qwant\.com', +'searches\.safehomepage\.com', +'searches\.vi-view\.com', +'wow\.utop\.it', +'windowssearch\.com', +'www\.wow\.com', +'globososo\.', +'kingtale3\.inspsearch\.com', +'swisscows\.ch', +'preciobarato\.xyz', +'www\.dregol\.com', +'search\.socialdownloadr\.com', +'int\.search\.myway\.com', +'de\.dolphin\.com', +'mys\.yoursearch\.me', +# Chello Portals 
+'chello\.at', +'chello\.be', +'chello\.cz', +'chello\.fr', +'chello\.hu', +'chello\.nl', +'chello\.no', +'chello\.pl', +'chello\.se', +'chello\.sk', +'chello', # required as catchall for new countries not yet known +# Mirago +'mirago\.be', +'mirago\.ch', +'mirago\.de', +'mirago\.dk', +'es\.mirago\.com', +'mirago\.fr', +'mirago\.it', +'mirago\.nl', +'no\.mirago\.com', +'mirago\.se', +'mirago\.co\.uk', +'mirago', # required as catchall for new countries not yet known +'answerbus\.com', +'icq\.com\/search', +'nusearch\.com', +'goodsearch\.com', +'scroogle\.org', +'questionanswering\.com', +'mywebsearch\.com', +'as\.starware\.com', +# Social Bookmarking Services +'del\.icio\.us', +'digg\.com', +'stumbleupon\.com', +'swik\.net', +'segnalo\.alice\.it', +'ineffabile\.it', +# Minor Australian search engines +'anzwers\.com\.au', +# Minor brazilian search engines +'engine\.exe', 'miner\.bol\.com\.br', +# Minor chinese search engines +'\.baidu\.com', # baidu search portal +'\.vnet\.cn', # powered by MSN +'\.soso\.com', # powered by Google +'\.sogou\.com', # powered by Sohu +'\.3721\.com', # powered by Yahoo! 
+'iask\.com', # powered by Sina +'\.accoona\.com', # Accoona +'\.163\.com', # powered by Google +'\.zhongsou\.com', # zhongsou search portal +# Minor czech search engines +'atlas\.cz','seznam\.cz','quick\.cz','centrum\.cz','jyxo\.(cz|com)','najdi\.to','redbox\.cz', +'isearch\.avg\.com', +# Minor danish search-engines +'opasia\.dk', 'danielsen\.com', 'sol\.dk', 'jubii\.dk', 'find\.dk', 'edderkoppen\.dk', 'netstjernen\.dk', 'orbis\.dk', 'tyfon\.dk', '1klik\.dk', 'ofir\.dk', +# Minor dutch search engines +'ilse\.','vindex\.', +# Minor english search engines +'(^|\.)ask\.co\.uk','bbc\.co\.uk/cgi-bin/search','ifind\.freeserve','looksmart\.co\.uk','splut\.','spotjockey\.','ukdirectory\.','ukindex\.co\.uk','ukplus\.','searchy\.co\.uk', +'search\.fbdownloader\.com', +'search\.fdownloadr\.com', +'search\.babylon\.com', +'my\.allgameshome\.com', +'surfcanyon\.com', +'uk\.foxstart\.com', +'yandex\.com', +# Minor finnish search engines +'haku\.www\.fi', +# Minor french search engines +'recherche\.aol\.fr','ctrouve\.','francite\.','\.lbb\.org','rechercher\.libertysurf\.fr', 'search[\w\-]+\.free\.fr', 'recherche\.club-internet\.fr', +'toile\.com', 'biglotron\.com', +'mozbot\.fr', +# Minor german search engines +'sucheaol\.aol\.de', +'o2suche\.aol\.de', +'fireball\.de','infoseek\.de','suche\d?\.web\.de','[a-z]serv\.rrzn\.uni-hannover\.de', +'suchen\.abacho\.de','(brisbane|suche)\.t-online\.de','allesklar\.de','meinestadt\.de', +'212\.227\.33\.241', +'(161\.58\.227\.204|161\.58\.247\.101|212\.40\.165\.90|213\.133\.108\.202|217\.160\.108\.151|217\.160\.111\.99|217\.160\.131\.108|217\.160\.142\.227|217\.160\.176\.42)', +'wwweasel\.de', +'netluchs\.de', +'schoenerbrausen\.de', +'suche\.gmx\.net', +'suche\.gmx\.at', +'ecosia\.org', +'de\.aolsearch\.com', +'suche\.aol\.de', +'www\.startxxl\.com', +'www\.benefind\.de', +'www\.amazon\.de.*search', #Just as a reminder, probably will not work as AWstats seems to consider the host part of an URL only +'de\.wow\.com', +'www\.vlips\.de', 
+'metager\.de', +'search\.1und1\.de', +'sm\.de', +'sumaja\.de', +'navigationshilfe\.t-online\.de', +'umfis\.de', +'fastbot\.de', +'tixuma\.de', +'suche\.freenet\.de', +'www\.izito\.de', +'extern\.peoplecheck\.de', +'www\.oneseek\.de', +'de\.wiki\.gov\.cn', +'umuwa\.de', +'suche\.1und1\.de', +'www\.metasuche\.ch', +# Minor Hungarian search engines +'heureka\.hu','vizsla\.origo\.hu','lapkereso\.hu','goliat\.hu','index\.hu','wahoo\.hu','webmania\.hu','search\.internetto\.hu', +'tango\.hu', +'keresolap\.hu', +'kereso\.startlap\.hu', +'polymeta\.hu', +# Minor Indian search engines +'sify\.com', +# Minor Italian search engines +'virgilio\.it','arianna\.libero\.it','supereva\.com','kataweb\.it','search\.alice\.it\.master','search\.alice\.it','gotuneed\.com', +'godado','jumpy\.it','shinyseek\.it','teecno\.it', +# Minor Israeli search engines +'search\.genieo\.com', +# Minor Japanese search engines +'ask\.jp','sagool\.jp', +'websearch\.rakuten\.co\.jp', +# Minor Norwegian search engines +'sok\.start\.no', 'eniro\.no', +# Minor Polish search engines +'szukaj\.wp\.pl','szukaj\.onet\.pl','dodaj\.pl','gazeta\.pl','gery\.pl','hoga\.pl','netsprint\.pl','interia\.pl','katalog\.onet\.pl','o2\.pl','polska\.pl','szukacz\.pl','wow\.pl', +# Minor russian search engines +'ya(ndex)?\.ru', 'aport\.ru', 'rambler\.ru', 'turtle\.ru', 'metabot\.ru', +'go\.mail\.ru', +# Minor Swedish search engines +'evreka\.passagen\.se','eniro\.se', +# Minor Slovak search engines +'zoznam\.sk', +# Minor Portuguese search engines +'sapo\.pt', +# Minor swiss search engines +'search\.ch', 'search\.bluewin\.ch', +'www\.zapmeta\.ch', +'etools\.ch', +# Minor Croatian, Serbian, Macedonian, Bosnian and Herzegovinian search engines +'pogodak\.' 
+); +@SearchEnginesSearchIDOrder_listgen=( +# Generic search engines +'search\..*\.\w+' +); + + +# NotSearchEnginesKeys +# If a search engine key is found, we check its exclude list to know if it's +# really a search engine +# Each key must appear only once: a repeated hash key silently keeps only its +# last value, so several exclusions for one engine are joined with '|' +#------------------------------------------------------------------------------ +%NotSearchEnginesKeys=( +'altavista\.'=>'babelfish\.altavista\.', +'google\.'=>'(?:mail|translate|code|groups)\.google\.', +'msn\.'=>'hotmail\.msn\.', +'tiscali\.'=>'mail\.tiscali\.', +'yahoo\.'=>'(?:picks|mail)\.yahoo\.|yahoo\.[^/]+/picks', +'yandex\.'=>'direct\.yandex\.' +); + + +# SearchEnginesHashID +# Each Search Engine Search ID is associated to an AWStats id string +# The keys must be spelled exactly like the patterns in the +# SearchEnginesSearchIDOrder lists above +#------------------------------------------------------------------------------ +%SearchEnginesHashID = ( +# Major international search engines +'google\.[\w.]+/products','google_products', +'base\.google\.','google_base', +'froogle\.google\.','google_froogle', +'groups\.google\.','google_groups', +'images\.google\.','google_image', +'google\.','google', +'googlee\.','google', +'googlecom\.com','google', +'goggle\.co\.hu','google', +'216\.239\.32\.20', 'google', +'173\.194\.32\.223', 'google', +'216\.239\.(35|37|39|51)\.100','google_cache', +'216\.239\.(35|37|39|51)\.101','google_cache', +'216\.239\.5[0-9]\.104','google_cache', +'64\.233\.1[0-9]{2}\.104','google_cache', +'66\.102\.[1-9]\.104','google_cache', +'66\.249\.93\.104','google_cache', +'72\.14\.2[0-9]{2}\.104','google_cache', +'msn\.','msn', +'live\.com','live', +'bing\.','bing', +'voila\.','voila', +'mindset\.research\.yahoo','yahoo_mindset', +'yahoo\.','yahoo','(66\.218\.71\.225|216\.109\.117\.135|216\.109\.125\.130|66\.218\.69\.11)','yahoo', +'lycos\.','lycos', +'alexa\.com','alexa', +'alltheweb\.com','alltheweb', +'altavista\.','altavista', +'a9\.com','a9', +'dmoz\.org','dmoz', +'netscape\.','netscape', +'search\.terra\.','terra', 
+'www\.search\.com','search.com', +'tiscali\.','tiscali', +'search\.aol\.co','aol', +'search\.sli\.sympatico\.ca','sympatico', +'excite\.','excite', +# Minor international search engines +'4\-counter\.com','google4counter', +'att\.net','att', +'bungeebonesdotcom','bungeebonesdotcom', +'northernlight\.','northernlight', +'hotbot\.','hotbot', +'kvasir\.','kvasir', +'webcrawler\.','webcrawler', +'metacrawler\.','metacrawler', +'go2net\.com','go2net', +'(^|\.)go\.com','go', +'euroseek\.','euroseek', +'looksmart\.','looksmart', +'spray\.','spray', +'nbci\.com\/search','nbci', +'de\.ask\.com','askde', # break out Ask country specific engines. +'es\.ask\.com','askes', +'fr\.ask\.com','askfr', +'it\.ask\.com','askit', +'nl\.ask\.com','asknl', +'uk\.ask\.com','askuk', +'(^|\.)ask\.co\.uk','askuk', +'(^|\.)ask\.com','ask', +'atomz\.','atomz', +'overture\.com','overture', # Replace 'goto\.com','Goto.com', +'teoma\.','teoma', +'findarticles\.com','findarticles', +'infospace\.com','infospace', +'mamma\.','mamma', +'dejanews\.','dejanews', +'dogpile\.com','dogpile', +'wisenut\.com','wisenut', +'ixquick\.com','ixquick', +'search\.earthlink\.net','earthlink', +'i-une\.com','iune', +'blingo\.com','blingo', +'centraldatabase\.org','centraldatabase', +'clusty\.com','clusty', +'mysearch\.','mysearch', +'vivisimo\.com','vivisimo', +'kartoo\.com','kartoo', +'icerocket\.com','icerocket', +'sphere\.com','sphere', +'ledix\.net','ledix', +'start\.shaw\.ca','shawca', +'searchalot\.com','searchalot', +'copernic\.com','copernic', +'avantfind\.com','avantfind', +'steadysearch\.com','steadysearch', +'steady-search\.com','steadysearch', +'claro-search\.com','clarosearch', +'www1\.search-results\.com', 'searchresults', +'www\.holasearch\.com', 'holasearch', +'search\.conduit\.com', 'conduit', +'static\.flipora\.com', 'flipora', +'(?:www[12]?|mixidj)\.delta-search\.com', 'delta-search', +'start\.iminent\.com', 'iminent', +'www\.searchmobileonline\.com', 'searchmobileonline', 
+'int\.search-results\.com', 'nortonsavesearch', +'www2\.inbox\.com', 'inbox', +'www\.govome\.com', 'govome', +'find1friend\.com', 'find1friend', +'start\.mysearchdial\.com', 'mysearchdial', +'go\.speedbit\.com', 'speedbit', +'search\.certified-toolbar\.com', 'certifiedtoolbarsearch', +'search\.sweetim\.com', 'sweetim', +'search\.searchcompletion\.com', 'searchcompletion', +'en\.eazel\.com','eazelsearch', +'sr\.searchfunmoods\.com', 'searchfunmoods', +'173\.194\.35\.177', 'googleByIP', +'dalesearch\.com', 'dalesearch', +'sweetpacks-search\.com', 'sweetpacks', +'searchgol\.com', 'searchgol', +'duckduckgo\.com', 'duckduckgo', +'sr\.facemoods\.com', 'facemoods', +'shoppstop\.com', 'shoppstop', +'searchya\.com', 'searchya', +'picsearch\.de', 'picsearch', +'webssearches\.com', 'webssearches', +'airzip\.inspsearch\.com', 'webssearches', +'zapmeta\.de', 'zapmeta', +'localmoxie\.com', 'localmoxie', +'search-results\.mobi', 'search-results_mobi', +'androidsearch\.com', 'androidsearch', +'isearch\.nation\.com', 'isearch_nation_com', +'search\.zonealarm\.com', 'search_zonealarm_com', +'www\.buenosearch\.com', 'www_buenosearch_com', +'search\.foxtab\.com', 'search_foxtab_com', +'searches\.qone8\.com', 'searches_qone8_com', +'startpage\.com', 'startpage_com', +'www\.qwant\.com', 'qwant_com', +'searches\.safehomepage\.com', 'safehomepage_com', +'searches\.vi-view\.com', 'vi-view_com', +'wow\.utop\.it', 'wow_utop_it', +'windowssearch\.com', 'windowssearch_com', +'www\.wow\.com', 'www_wow_com', +'globososo\.', 'globososo', +'kingtale3\.inspsearch\.com', 'globososo', +'swisscows\.ch', 'swisscows_ch', +'preciobarato\.xyz', 'preciobarato_xyz', +'www\.dregol\.com', 'www_dregol_com', +'search\.socialdownloadr\.com', 'search_socialdownloadr_com', +'int\.search\.myway\.com', 'int_search_myway_com', +'de\.dolphin\.com', 'de_dolphin_com', +'mys\.yoursearch\.me', 'mys_yoursearch_me', +# Chello Portals +'chello\.at','chelloat', +'chello\.be','chellobe', +'chello\.cz','chellocz', 
+'chello\.fr','chellofr', +'chello\.hu','chellohu', +'chello\.nl','chellonl', +'chello\.no','chellono', +'chello\.pl','chellopl', +'chello\.se','chellose', +'chello\.sk','chellosk', +'chello','chellocom', +# Mirago +'mirago\.be','miragobe', +'mirago\.ch','miragoch', +'mirago\.de','miragode', +'mirago\.dk','miragodk', +'es\.mirago\.com','miragoes', +'mirago\.fr','miragofr', +'mirago\.it','miragoit', +'mirago\.nl','miragonl', +'no\.mirago\.com','miragono', +'mirago\.se','miragose', +'mirago\.co\.uk','miragocouk', +'mirago','mirago', # required as catchall for new countries not yet known +'answerbus\.com','answerbus', +'icq\.com\/search','icq', +'nusearch\.com','nusearch', +'goodsearch\.com','goodsearch', +'scroogle\.org','scroogle', +'questionanswering\.com','questionanswering', +'mywebsearch\.com','mywebsearch', +'as\.starware\.com','comettoolbar', +# Social Bookmarking Services +'del\.icio\.us','delicious', +'digg\.com','digg', +'stumbleupon\.com','stumbleupon', +'swik\.net','swik', +'segnalo\.alice\.it','segnalo', +'ineffabile\.it','ineffabile', +# Minor Australian search engines +'anzwers\.com\.au','anzwers', +# Minor brazilian search engines +'engine\.exe','engine', +'miner\.bol\.com\.br','miner', +# Minor chinese search engines +'\.baidu\.com','baidu', +'iask\.com','iask', +'\.accoona\.com','accoona', +'\.3721\.com','3721', +'\.163\.com','netease', +'\.soso\.com','soso', +'\.zhongsou\.com','zhongsou', +'\.vnet\.cn','vnet', +'\.sogou\.com','sogou', +# Minor czech search engines +'atlas\.cz','atlas', +'seznam\.cz','seznam', +'quick\.cz','quick', +'centrum\.cz','centrum', +'jyxo\.(cz|com)','jyxo', +'najdi\.to','najdi', +'redbox\.cz','redbox', +'isearch\.avg\.com', 'avgsearch', +# Minor danish search-engines +'opasia\.dk','opasia', +'danielsen\.com','danielsen', +'sol\.dk','sol', +'jubii\.dk','jubii', +'find\.dk','finddk', +'edderkoppen\.dk','edderkoppen', +'netstjernen\.dk','netstjernen', +'orbis\.dk','orbis', +'tyfon\.dk','tyfon', +'1klik\.dk','1klik', 
+'ofir\.dk','ofir', +# Minor dutch search engines +'ilse\.','ilse', +'vindex\.','vindex', +# Minor english search engines +'bbc\.co\.uk/cgi-bin/search','bbc', +'ifind\.freeserve','freeserve', +'looksmart\.co\.uk','looksmartuk', +'splut\.','splut', +'spotjockey\.','spotjockey', +'ukdirectory\.','ukdirectory', +'ukindex\.co\.uk','ukindex', +'ukplus\.','ukplus', +'searchy\.co\.uk','searchy', +'search\.fbdownloader\.com','fbdownloader', +'search\.fdownloadr\.com', 'fdownloadr_com', +'search\.babylon\.com', 'babylon', +'my\.allgameshome\.com', 'allgameshome', +'surfcanyon\.com', 'surfcanyon_com', +'uk\.foxstart\.com', 'uk_foxstart_com', +'yandex\.com', 'yandex_com', +# Minor finnish search engines +'haku\.www\.fi','haku', +# Minor french search engines +'recherche\.aol\.fr','aolfr', +'ctrouve\.','ctrouve', +'francite\.','francite', +'\.lbb\.org','lbb', +'rechercher\.libertysurf\.fr','libertysurf', +'search[\w\-]+\.free\.fr','free', +'recherche\.club-internet\.fr','clubinternet', +'toile\.com','toile', +'biglotron\.com', 'biglotron', +'mozbot\.fr', 'mozbot', +# Minor german search engines +'sucheaol\.aol\.de','aolde', +'o2suche\.aol\.de','o2aolde', +'fireball\.de','fireball', +'infoseek\.de','infoseek', +'suche\d?\.web\.de','webde', +'[a-z]serv\.rrzn\.uni-hannover\.de','meta', +'suchen\.abacho\.de','abacho', +'(brisbane|suche)\.t-online\.de','t-online', +'allesklar\.de','allesklar', +'meinestadt\.de','meinestadt', +'212\.227\.33\.241','metaspinner', +'(161\.58\.227\.204|161\.58\.247\.101|212\.40\.165\.90|213\.133\.108\.202|217\.160\.108\.151|217\.160\.111\.99|217\.160\.131\.108|217\.160\.142\.227|217\.160\.176\.42)','metacrawler_de', +'wwweasel\.de','wwweasel', +'netluchs\.de','netluchs', +'schoenerbrausen\.de','schoenerbrausen', +'suche\.gmx\.net', 'gmxsuche', +'suche\.gmx\.at', 'gmxsuche_at', +'ecosia\.org', 'ecosiasearch', +'de\.aolsearch\.com', 'aolsearch', +'suche\.aol\.de', 'aolsuche', +'www\.startxxl\.com', 'startxxl', +'www\.benefind\.de', 'benefind', 
+'www\.amazon\.de.*search', 'amazonsearch', #Not clear if this matches amazon searches only +'de\.wow\.com', 'wowsearch', +'www\.vlips\.de', 'vlips_de', +'metager\.de', 'metager', +'search\.1und1\.de', 'search_1und1_de', +'sm\.de', 'smde', +'sumaja\.de', 'sumaja', +'navigationshilfe\.t-online\.de', 'navigationshilfe', +'umfis\.de', 'umfis', +'fastbot\.de', 'fastbot_de', +'tixuma\.de', 'tixuma_de', +'suche\.freenet\.de', 'freenet_de', +'www\.izito\.de', 'izito_de', +'extern\.peoplecheck\.de', 'peoplecheck_de', +'www\.oneseek\.de', 'oneseek_de', +'de\.wiki\.gov\.cn', 'de_wiki_gov_cn', +'umuwa\.de', 'umuwa_de', +'suche\.1und1\.de', '1und1_de', +'www\.metasuche\.ch', 'metasuche_ch', +# Minor Hungarian search engines +'heureka\.hu','heureka', +'vizsla\.origo\.hu','origo', +'lapkereso\.hu','lapkereso', +'goliat\.hu','goliat', +'index\.hu','indexhu', +'wahoo\.hu','wahoo', +'webmania\.hu','webmania', +'search\.internetto\.hu','internetto', +'tango\.hu','tango_hu', +'keresolap\.hu','keresolap_hu', +'kereso\.startlap\.hu', 'startlap_hu', +'polymeta\.hu','polymeta_hu', +# Minor Indian search engines +'sify\.com','sify', +# Minor Italian search engines +'virgilio\.it','virgilio', +'arianna\.libero\.it','arianna', +'supereva\.com','supereva', +'kataweb\.it','kataweb', +'search\.alice\.it\.master','aliceitmaster', +'search\.alice\.it','aliceit', +'gotuneed\.com','gotuneed', +'godado','godado', +'jumpy\.it','jumpy\.it', +'shinyseek\.it','shinyseek\.it', +'teecno\.it','teecnoit', +# Minor Israeli search engines +'search\.genieo\.com', 'genieo', +# Minor Japanese search engines +'ask\.jp','askjp', +'sagool\.jp','sagool', +'websearch\.rakuten\.co\.jp', 'rakuten', +# Minor Norwegian search engines +'sok\.start\.no','start', 'eniro\.no','eniro', +# Minor Polish search engines +'szukaj\.wp\.pl','wp', +'szukaj\.onet\.pl','onetpl', +'dodaj\.pl','dodajpl', +'gazeta\.pl','gazetapl', +'gery\.pl','gerypl', +'hoga\.pl','hogapl', # gives the 'hoga\.pl' pattern of SearchEnginesSearchIDOrder_list2 an id +'netsprint\.pl\/hoga\-search','hogapl', +'netsprint\.pl','netsprintpl', 
+'interia\.pl','interiapl', +'katalog\.onet\.pl','katalogonetpl', +'o2\.pl','o2pl', +'polska\.pl','polskapl', +'szukacz\.pl','szukaczpl', +'wow\.pl','wowpl', +# Minor russian search engines +'ya(ndex)?\.ru','yandex', +'aport\.ru','aport', +'rambler\.ru','rambler', +'turtle\.ru','turtle', +'metabot\.ru','metabot', +'go\.mail\.ru', 'mailru', +# Minor Swedish search engines +'evreka\.passagen\.se','passagen', +'eniro\.se','enirose', +# Minor Slovak search engines +'zoznam\.sk','zoznam', +# Minor Portuguese search engines +'sapo\.pt','sapo', +# Minor swiss search engines +'search\.ch','searchch', +'search\.bluewin\.ch','bluewin', +'www\.zapmeta\.ch', 'zapmeta_ch', +'etools\.ch', 'etools_ch', +# Minor Croatian, Serbian, Macedonian, Bosnian and Herzegovinian search engines +'pogodak\.','pogodak', +# Generic search engines +'search\..*\.\w+','search' +); + + +# SearchEnginesWithKeysNotInQuery +# List of search engines that store keyword as page instead of query parameter +#------------------------------------------------------------------------------ +%SearchEnginesWithKeysNotInQuery=( +'a9',1, # www.a9.com/searchkey1%20searchkey2 +'iminent',1, #http://start.iminent.com/StartWeb/1031/toolbox/#q=searchkey1%20searchkey2&additional_arguments +'de_wiki_gov_cn',1, #http://de.wiki.gov.cn/s_searchkey1%20searchkey2 +'umuwa_de', 1, #http://umuwa.de/searchkey or http://umuwa.de/searchkey/Images +'amazonsearch', 1 #http://www.amazon.de/gp/bit/apps/web/SERP/search/ref=bit_bds-p24_serp_cr_de?ie=UTF8tagbase=bds-p24&query=deutsch+8.+klasse+gymnasium+protokoll +); + +# SearchEnginesKnownUrl +# Known rules to extract keywords from a referrer search engine URL +#------------------------------------------------------------------------------ +%SearchEnginesKnownUrl=( +# Most common search engines +'alexa','q=', +'alltheweb','q(|uery)=', +'altavista','q=', +'a9','a9\.com\/', +'dmoz','search=', +'google_products','(p|q|as_p|as_q)=', +'google_base','(p|q|as_p|as_q)=', 
+'google_froogle','(p|q|as_p|as_q)=', +'google_groups','group\/', # does not work +'google_image','(p|q|as_p|as_q)=', +'google_cache','(p|q|as_p|as_q)=cache:[0-9A-Za-z]{12}:', +'google','(p|q|as_p|as_q)=', +'lycos','query=', +'msn','q=', +'live','q=', +'bing','q=', +'netscape','search=', +'tiscali','key=', +'aol','query=', +'terra','query=', +'voila','(kw|rdata)=', +'search.com','q=', +'yahoo_mindset','p=', +'yahoo','p=', +'sympatico', 'query=', +'excite','search=', +# Minor international search engines +'google4counter','(p|q|as_p|as_q)=', +'att','qry=', +'bungeebonesdotcom','query=', +'go','qt=', +'askde','(ask|q)=', # break out Ask country specific engines. +'askes','(ask|q)=', +'askfr','(ask|q)=', +'askit','(ask|q)=', +'asknl','(ask|q)=', +'ask','(ask|q)=', +'atomz','sp-q=', +'euroseek','query=', +'findarticles','key=', +'go2net','general=', +'hotbot','mt=', +'infospace','qkw=', +'kvasir', 'q=', +'looksmart','key=', +'mamma','query=', +'metacrawler','general=', +'nbci','keyword=', +'northernlight','qr=', +'overture','keywords=', +'dogpile', 'q(|kw)=', +'spray','string=', +'teoma','q=', +'webcrawler','searchText=', +'wisenut','query=', +'ixquick', 'query=', +'earthlink', 'q=', +'iune','(keywords|q)=', +'blingo','q=', +'centraldatabase','query=', +'clusty','query=', +'mysearch','searchfor=', +'vivisimo','query=', +# kartoo: No keywords passed in referring URL. 
+'kartoo','', +'icerocket','q=', +'sphere','q=', +'ledix','q=', +'shawca','q=', +'searchalot','q=', +'copernic','web\/', +'avantfind','keywords=', +'steadysearch','w=', +'clarosearch','q=', +'searchresults','q=', +'holasearch', 'q=', +'conduit', 'q=', +'flipora', 'q=', +'delta-search', 'q=', +'iminent', 'q=', +'searchmobileonline', 'q=', +'nortonsavesearch', 'q=', +'inbox', 'q(?:kw)?=', +'govome', 'q=', +'find1friend', 'q=', +'mysearchdial', 'q=', +'speedbit', 'q=', +'certifiedtoolbarsearch', 'q=', +'sweetim', 'q=', +'searchcompletion', 'q=', +'eazelsearch', 'q=', +'searchfunmoods', 'q=', +'googleByIP', 'q=', +'dalesearch', 'q=', +'sweetpacks', 'q=', +'searchgol', 'q=', +'duckduckgo', 'uddg=', +'facemoods', 'q=', +'shoppstop', 'keywords=', +'searchya', 'q=', +'picsearch', 'q=', +'webssearches', 'q=', +'zapmeta', 'query=', +'localmoxie', 'keyword=', +'search-results_mobi', 'q=', +'androidsearch', 'q=', +'isearch_nation_com', 'q=', +'search_zonealarm_com', 'q=', +'www_buenosearch_com', 'q=', +'search_foxtab_com', 'q=', +'searches_qone8_com', 'q=', +'startpage_com', 'query=', +'qwant_com', 'q=', +'safehomepage_com', 'q=', +'vi-view_com', 'q=', +'wow_utop_it', 'q=', +'windowssearch_com', 'q=', +'www_wow_com', 'q=', +'globososo', 'q=', +'swisscows_ch', 'query=', +'preciobarato_xyz', 's=', +'www_dregol_com', 'q=', +'search_socialdownloadr_com', 'q=', +'int_search_myway_com', 'searchfor=', +'de_dolphin_com', 'q=', +'mys_yoursearch_me', 'q=', +# Chello Portals +'chelloat','q1=', +'chellobe','q1=', +'chellocz','q1=', +'chellofr','q1=', +'chellohu','q1=', +'chellonl','q1=', +'chellono','q1=', +'chellopl','q1=', +'chellose','q1=', +'chellosk','q1=', +'chellocom','q1=', +# Mirago +'miragobe','(txtsearch|qry)=', +'miragoch','(txtsearch|qry)=', +'miragode','(txtsearch|qry)=', +'miragodk','(txtsearch|qry)=', +'miragoes','(txtsearch|qry)=', +'miragofr','(txtsearch|qry)=', +'miragoit','(txtsearch|qry)=', +'miragonl','(txtsearch|qry)=', +'miragono','(txtsearch|qry)=', 
+'miragose','(txtsearch|qry)=', +'miragocouk','(txtsearch|qry)=', +'mirago','(txtsearch|qry)=', +'answerbus','', # Does not provide query parameters +'icq','q=', +'nusearch','nusearch_terms=', +'goodsearch','Keywords=', +'scroogle','Gw=', # Does not always provide query parameters +'questionanswering','', +'mywebsearch','searchfor=', +'comettoolbar','qry=', +# Social Bookmarking Services +'delicious','all=', +'digg','s=', +'stumbleupon','', +'swik','swik\.net/', # does not work. Keywords follow domain, e.g. http://swik.net/awstats+analytics +'segnalo','', +'ineffabile','', +# Minor Australian search engines +'anzwers','search=', +# Minor brazilian search engines +'engine','p1=', 'miner','q=', +# Minor chinese search engines +'baidu','(wd|word)=', +'iask','(w|k)=', +'accoona','qt=', +'3721','(p|name)=', +'netease','q=', +'soso','q=', +'zhongsou','(word|w)=', +'sogou', 'query=', +'vnet','kw=', +# Minor czech search engines +'atlas','(searchtext|q)=', 'seznam','(w|q)=', 'quick','query=', 'centrum','q=', 'jyxo','(s|q)=', 'najdi','dotaz=', 'redbox','srch=', +'avgsearch', 'q=', +# Minor danish search engines +'opasia','q=', 'danielsen','q=', 'sol','q=', 'jubii','soegeord=', 'finddk','words=', 'edderkoppen','query=', 'orbis','search_field=', '1klik','query=', 'ofir','querytext=', +# Minor dutch search engines +'ilse','search_for=', 'vindex','in=', +# Minor english search engines +'askuk','(ask|q)=', 'bbc','q=', 'freeserve','q=', 'looksmartuk','key=', +'splut','pattern=', 'spotjockey','Search_Keyword=', 'ukindex', 'stext=', 'ukdirectory','k=', 'ukplus','search=', 'searchy', 'search_term=', +'fbdownloader','q=', +'fdownloadr_com', 'q=', +'babylon','q=', +'allgameshome', 's=', +'surfcanyon_com', 'q=', +'uk_foxstart_com', 'q=', +'yandex_com', 'text=', +# Minor finnish search engines +'haku','w=', +# Minor french search engines +'francite','name=', 'clubinternet', 'q=', +'toile', 'q=', +'biglotron','question=', +'mozbot','q=', +# Minor german search engines +'aolde','q=', 
+'o2aolde', 'q=', +'fireball','q=', 'infoseek','qt=', 'webde','su=', +'abacho','q=', 't-online','q=', +'metaspinner','qry=', +'metacrawler_de','qry=', +'wwweasel','q=', +'netluchs','query=', +'schoenerbrausen','q=', +'gmxsuche', 'q=', +'gmxsuche_at', 'q=', +'ecosiasearch', 'q=', +'aolsearch', 'q=', +'aolsuche', 'q=', +'startxxl', 'q=', +'benefind', 'q=', +'amazonsearch', 'query=', +'wowsearch', 'q=', +'vlips_de', 'q=', +'metager', 'eingabe=', +'search_1und1_de', 'q=', +'smde', 'q=', +#'sumaja', 'no query string available', #There is no query string in the referrer url +'navigationshilfe', 'q=', +'umfis', 'suchbegriff=', +'fastbot_de', 'red=[0-9]*\+', +'tixuma_de', 'sc=', +'freenet_de', 'query=', +'izito_de', 'q=', +'peoplecheck_de', 'q=', +'oneseek_de', 'q=', +'de_wiki_gov_cn', 'de\.wiki\.gov\.cn\/s_', +'umuwa_de', 'umuwa\.de\/', +'1und1_de', 'q=', +'metasuche_ch', 'q=', +# Minor Hungarian search engines +'heureka','heureka=', 'origo','(q|search)=', 'goliat','KERESES=', 'wahoo','q=', 'internetto','searchstr=', +'keresolap_hu','q=', +'startlap_hu', 'q=', +'tango_hu','q=', +'polymeta_hu','', +# Minor Indian search engines +'sify','keyword=', +# Minor Italian search engines +'virgilio','qs=', +'arianna','query=', +'supereva','q=', +'kataweb','q=', +'aliceitmaster','qs=', +'aliceit','qs=', +'gotuneed','', # Not yet known +'godado','Keywords=', +'jumpy\.it','searchWord=', +'shinyseek\.it','KEY=', +'teecnoit','q=', +# Minor Israeli search engines +'genieo','q=', +# Minor Japanese search engines +'askjp','(ask|q)=', +'sagool','q=', +'rakuten', 'qt=', +# Minor Norwegian search engines +'start','q=', 'eniro','q=', +# Minor Polish search engines +'wp','szukaj=', +'onetpl','qt=', +'dodajpl','keyword=', +'gazetapl','slowo=', +'gerypl','q=', +'hogapl','qt=', +'netsprintpl','q=', +'interiapl','q=', +'katalogonetpl','qt=', +'o2pl','qt=', +'polskapl','qt=', +'szukaczpl','q=', +'wowpl','q=', +# Minor russian search engines +'yandex', 'text=', 'rambler','words=', 'aport', 'r=', 
'metabot', 'st=', +'mailru', 'q=', +# Minor swedish search engines +'passagen','q=', +'enirose', 'hitta:', #Not sure if this works, as the keywords are part of the URL, and therefore the URL does not contain a question mark. +# Minor swiss search engines +'searchch', 'q=', 'bluewin', 'qry=', +'zapmeta_ch', 'query=', +'etools_ch', 'query=', +# Minor Croatian, Serbian, Macedonian, Bosnian and Herzegovinian search engines +'pogodak', 'q=' +); + +# SearchEnginesKnownUrlNotFound +# Known rules to extract not found keywords from a referrer search engine URL +#------------------------------------------------------------------------------ +%SearchEnginesKnownUrlNotFound=( +# Most common search engines +'msn','origq=' +); + +# If no rules are known, we take the first parameter that is not in WordsToCleanSearchUrl +#------------------------------------------------------------------------------ +@WordsToCleanSearchUrl= ('act=','annuaire=','btng=','cat=','categoria=','cfg=','cof=','cou=','count=','cp=','dd=','domain=','dt=','dw=','enc=','exec=','geo=','hc=','height=','hits=','hl=','hq=','hs=','id=','kl=','lang=','loc=','lr=','matchmode=','medor=','message=','meta=','mode=','order=','page=','par=','pays=','pg=','pos=','prg=','qc=','refer=','sa=','safe=','sc=','sort=','src=','start=','style=','stype=','sum=','tag=','temp=','theme=','type=','url=','user=','width=','what=','\\.x=','\\.y=','y=','look='); + +# SearchEnginesKnownUTFCoding +# Known parameter that proves a search engine has coded its parameters in UTF-8 +#------------------------------------------------------------------------------ +%SearchEnginesKnownUTFCoding=( +# Most common search engines +'google','ie=utf-8', +'alltheweb','cs=utf-8' +); + + +# SearchEnginesHashLib +# List of search engine names +# 'search_engine_id', 'search_engine_name', +#------------------------------------------------------------------------------ +%SearchEnginesHashLib=( +# Major international search engines +'alexa','Alexa',
+'alltheweb','AllTheWeb', +'altavista','AltaVista', +'a9', 'A9', +'dmoz','DMOZ', +'google_products','Google (Products)', +'google_base','Google (Base)', +'google_froogle','Froogle (Google)', +'google_groups','Google (Groups)', +'google_image','Google (Images)', +'google_cache','Google (cache)', +'google','Google', +'lycos','Lycos', +'msn','Microsoft MSN Search', +'live','Microsoft Windows Live', +'bing','Microsoft Bing', +'netscape','Netscape', +'aol','AOL', +'terra','Terra', +'tiscali','Tiscali', +'voila','Voila', +'search.com','Search.com', +'yahoo_mindset','Yahoo! Mindset', +'yahoo','Yahoo!', +'sympatico','Sympatico', +'excite','Excite', +# Minor international search engines +'google4counter','4-counter (Google)', +'att','AT&T search (powered by Google)', +'bungeebonesdotcom','BungeeBones', +'go','Go.com', +'askde','Ask Deutschland', +'askes','Ask España', # break out Ask country specific engines. +'askfr','Ask France', +'askit','Ask Italia', +'asknl','Ask Nederland', +'ask','Ask', +'atomz','Atomz', +'dejanews','DejaNews', +'euroseek','Euroseek', +'findarticles','Find Articles', +'go2net','Go2Net (Metamoteur)', +'hotbot','Hotbot', +'infospace','InfoSpace', +'kvasir','Kvasir', +'looksmart','Looksmart', +'mamma','Mamma', +'metacrawler','MetaCrawler (Metamoteur)', +'nbci','NBCI', +'northernlight','NorthernLight', +'overture','Overture', # Replace 'goto\.com','Goto.com', +'dogpile','Dogpile', +'spray','Spray', +'teoma','Teoma', # Replace 'directhit\.com','DirectHit', +'webcrawler','WebCrawler', +'wisenut','WISENut', +'ixquick','ix quick', +'earthlink', 'Earth Link', +'iune','i-une', +'blingo','Blingo', +'centraldatabase','GPU p2p search', +'clusty','Clusty', +'mysearch','My Search', +'vivisimo','Vivisimo', +'kartoo','Kartoo', +'icerocket','Icerocket (Blog)', +'sphere','Sphere (Blog)', +'ledix','Ledix', +'shawca','Shaw.ca', +'searchalot','Searchalot', +'copernic','Copernic', +'avantfind','Avantfind', +'steadysearch','Avantfind', +'clarosearch','Claro Search', 
+'searchresults','Search-results', +'holasearch', 'Hola Search', +'conduit', 'Conduit Search', +'flipora', 'Flipora', +'delta-search', 'Delta Search', +'iminent', 'Iminent', +'searchmobileonline', 'Search Mobile Online (StartApp)', +'nortonsavesearch', 'Norton Safe Search', +'inbox', 'Inbox Search', +'govome', 'Govome', +'find1friend', 'Find1Friend', +'mysearchdial', 'My Search Dial', +'speedbit', 'Speedbit', +'certifiedtoolbarsearch', 'Certified-Toolbar Search', +'sweetim', 'SweetIM Search', +'searchcompletion', 'SearchCompletion Search', +'eazelsearch', 'Eazel Search', +'searchfunmoods', 'Funmoods', +'googleByIP', 'Google (Access by IP-Address)', +'dalesearch', 'Dale Search', +'sweetpacks', 'Sweetpacks', +'searchgol', 'Search-Gol', +'duckduckgo', 'DuckDuckGo (Does not provide search keyphrases, using found page instead)', +'facemoods', 'Facemoods Search', +'shoppstop', 'ShoppStop', +'searchya', 'Searchya', +'picsearch', 'picsearch', +'webssearches', 'Various variants of Webssearches EMG Technologies and airzip.inspsearch.com', +#Jan 8, 2016: No genuine inspsearch.com search engine seems to exist, but there are a couple of search engines using subdomains of inspsearch.com. Unclear how these are related to each other.
+'zapmeta', 'ZapMeta', +'localmoxie', 'Local Moxie', +'search-results_mobi', 'search-results.mobi', +'androidsearch', 'androidsearch.com', +'isearch_nation_com', 'Nation Search', +'search_zonealarm_com', 'Zone Alarm Search', +'www_buenosearch_com', 'BuenoSearch', +'search_foxtab_com', 'Foxtab Search', +'searches_qone8_com', 'Omiga-Plus', +'startpage_com', 'Startpage', +'qwant_com', 'qwant.com', +'safehomepage_com', 'safehomepage.com', +'vi-view_com', 'vi-view.com', +'wow_utop_it', 'wow.utop.it', +'windowssearch_com', 'windowssearch.com', +'www_wow_com', 'WOW.com', +'globososo', 'Various variants of Globososo (Kingtale Technology): www, searches, searches3, and at inspsearch.com (globososo, kingtale3)', +'swisscows_ch', 'Swisscows', +'preciobarato_xyz', 'Yandex', +'www_dregol_com', 'Dregol Search', +'search_socialdownloadr_com', 'Socialdownloadr', +'int_search_myway_com', 'MyWay', +'de_dolphin_com', 'Dolphin Search', +'mys_yoursearch_me', 'Yoursearch.me', +# Chello Portals +'chelloat','Chello Austria', +'chellobe','Chello Belgium', +'chellocz','Chello Czech Republic', +'chellofr','Chello France', +'chellohu','Chello Hungary', +'chellonl','Chello Netherlands', +'chellono','Chello Norway', +'chellopl','Chello Poland', +'chellose','Chello Sweden', +'chellosk','Chello Slovakia', +'chellocom','Chello (Country not recognized)', +# Mirago +'miragobe','Mirago Belgium', +'miragoch','Mirago Switzerland', +'miragode','Mirago Germany', +'miragodk','Mirago Denmark', +'miragoes','Mirago Spain', +'miragofr','Mirago France', +'miragoit','Mirago Italy', +'miragonl','Mirago Netherlands', +'miragono','Mirago Norway', +'miragose','Mirago Sweden', +'miragocouk','Mirago UK', +'mirago','Mirago (country unknown)', +'answerbus','Answerbus', +'icq','icq', +'nusearch','Nusearch', +'goodsearch','GoodSearch', +'scroogle','Scroogle', +'questionanswering','Questionanswering', +'mywebsearch','MyWebSearch', +'comettoolbar','Comet toolbar search', +# Social Bookmarking Services 
+'delicious','del.icio.us (Social Bookmark)', +'digg','Digg (Social Bookmark)', +'stumbleupon','Stumbleupon (Social Bookmark)', +'swik','Swik (Social Bookmark)', +'segnalo','Segnalo (Social Bookmark)', +'ineffabile','Ineffabile.it (Social Bookmark)', +# Minor Australian search engines +'anzwers','anzwers.com.au', +# Minor brazilian search engines +'engine','Cade', 'miner','Meta Miner', +# Minor chinese search engines +'baidu','Baidu', +'iask','Iask', +'accoona','Accoona', +'3721','3721', +'netease', 'NetEase', +'soso','SoSo', +'zhongsou','ZhongSou', +'sogou', 'SoGou', +'vnet','VNet', +# Minor czech search engines +'atlas','Atlas.cz', 'seznam','Seznam', 'quick','Quick.cz', 'centrum','Centrum.cz', 'jyxo','Jyxo.cz', 'najdi','Najdi.to', 'redbox','RedBox.cz', +'avgsearch', 'AVG Secure Search', +# Minor danish search-engines +'opasia','Opasia', 'danielsen','Thor (danielsen.com)', 'sol','SOL', 'jubii','Jubii', 'finddk','Find', 'edderkoppen','Edderkoppen', 'netstjernen','Netstjernen', 'orbis','Orbis', 'tyfon','Tyfon', '1klik','1Klik', 'ofir','Ofir', +# Minor dutch search engines +'ilse','Ilse','vindex','Vindex.nl', # label is display text, not a regex: a single-quoted '\.' would keep the backslash +# Minor english search engines +'askuk','Ask UK', +'bbc','BBC', 'freeserve','Freeserve', 'looksmartuk','Looksmart UK', +'splut','Splut', 'spotjockey','Spotjockey', 'ukdirectory','UK Directory', 'ukindex','UKIndex', 'ukplus','UK Plus', 'searchy','searchy.co.uk', +'fbdownloader','FBDownloader (fbdownloader)', +'fdownloadr_com', 'FBDownloader (fdownloadr)', +'babylon','Babylon', +'allgameshome', 'AllGamesHome', +'surfcanyon_com', 'SurfCanyon', +'uk_foxstart_com', 'Foxstart.com', +'yandex_com', 'Yandex', +# Minor finnish search engines +'haku','Ihmemaa', +# Minor french search engines +'aolfr','AOL (fr)', 'ctrouve','C\'est trouve', 'francite','Francite', 'lbb', 'LBB', 'libertysurf', 'Libertysurf', 'free', 'Free.fr', 'clubinternet', 'Club-internet', +'toile', 'Toile du Quebec', +'biglotron','Biglotron', +'mozbot','Mozbot', +# Minor German search engines
+'aolde','AOL (de)', +'o2aolde', 'o2 Suche', +'fireball','Fireball', 'infoseek','Infoseek', +'webde','Web.de', +'abacho','Abacho', +'t-online','T-Online', +'allesklar','allesklar.de', 'meinestadt','meinestadt.de', +'metaspinner','metaspinner', +'metacrawler_de','metacrawler.de', +'wwweasel','WWWeasel', +'netluchs','Netluchs', +'schoenerbrausen','Schoenerbrausen/', +'gmxsuche', 'GMX Suche', +'gmxsuche_at', 'GMX Suche Oesterreich', +'ecosiasearch', 'Ecosia Search', +'aolsearch', 'AOL Search', +'aolsuche', 'AOL Suche', +'startxxl', 'StartXXL', +'benefind', 'benefind', +'amazonsearch', 'Amazon Web Search', +'wowsearch', 'Wow Search', +'vlips_de', 'vlips.de', +'metager', 'MetaGer', +'search_1und1_de', '1&1 Suche (subdomain "search")', +'smde', 'SM.de - Die SuchMaschine', +'sumaja', 'Sumaja', +'navigationshilfe', 'T-Online Navigationshilfe', +'umfis', 'UMFIS-Online Das Umweltfirmen-Informationssystem der IHKs in Deutschland', +'fastbot_de', 'Fastbot.de (Does not provide search keyphrases, using found page instead)', +'tixuma_de', 'Tixuma Deutschland', +'freenet_de', 'suche.freenet.de', +'izito_de', 'iZito Deutschland', +'peoplecheck_de', 'PeopleCheck.de', +'oneseek_de', 'Metasuchmaschine OneSeek.de', +'de_wiki_gov_cn', 'Wiki Sucher', +'umuwa_de', 'Umuwa Deutschland', +'1und1_de', '1&1 Suche (subdomain "suche")', +'metasuche_ch', 'Metasuche.ch', +# Minor hungarian search engines +'heureka','Heureka', 'origo','Origo-Vizsla', 'lapkereso','Startlapkereso', 'goliat','Goliat', 'indexhu','Index', 'wahoo','Wahoo', 'webmania','webmania.hu', 'internetto','Internetto Kereso', +'tango_hu','Tango', +'keresolap_hu','Tango keresolap', +'startlap_hu','Startlab Kereso', +'polymeta_hu','Polymeta', +# Minor Indian search engines +'sify','Sify', +# Minor Italian search engines +'virgilio','Virgilio', +'arianna','Arianna', +'supereva','Supereva', +'kataweb','Kataweb', +'aliceitmaster','search.alice.it.master', +'aliceit','alice.it', +'gotuneed','got u need', +'godado','Godado.it', 
+'jumpy\.it','Jumpy.it', +'shinyseek\.it','Shinyseek.it', +'teecnoit','Teecno', +# Minor Israeli search engines +'genieo','Genieo', +# Minor Japanese search engines +'askjp','Ask Japan', +'sagool','Sagool', +'rakuten', 'websearch.rakuten.co.jp', +# Minor Norwegian search engines +'start','start.no', 'eniro','Eniro', +# Minor polish search engines +'wp','Wirtualna Polska', +'onetpl','Onet.pl', +'dodajpl','Dodaj.pl', +'gazetapl','Gazeta.pl', +'gerypl','Gery.pl', +'hogapl','Hoga.pl', +'netsprintpl','NetSprint.pl', +'interiapl','Interia.pl', +'katalogonetpl','Katalog.Onet.pl', +'o2pl','o2.pl', +'polskapl','Polska', +'szukaczpl','Szukacz', +'wowpl','Wow.pl', +# Minor russian search engines +'yandex', 'Yandex', 'aport', 'Aport', 'rambler', 'Rambler', 'turtle', 'Turtle', 'metabot', 'MetaBot', +'mailru','Mail.Ru', +# Minor Swedish search engines +'passagen','Evreka', +'enirose','Eniro Sverige', +# Minor Slovak search engines +'zoznam','Zoznam', +# Minor Portuguese search engines +'sapo','Sapo', +# Minor Swiss search engines +'searchch', 'search.ch', 'bluewin', 'search.bluewin.ch', +'zapmeta_ch', 'ZapMeta.ch', +'etools_ch', 'eTools.ch', +# Minor Croatian, Serbian, Macedonian, Bosnian and Herzegovinian search engines +'pogodak','Pogodak.com', +# Generic search engines +'search','Unknown search engines' +); + + +# Sanity check. +# Enable this code and run perl search_engines.pm to check file entries are ok +#----------------------------------------------------------------------------- +#foreach my $key (@SearchEnginesSearchIDOrder_list1) { +# if (! 
$SearchEnginesHashID{$key}) { error("Entry '$key' has been found in SearchEnginesSearchIDOrder_list1 with no value in SearchEnginesHashID"); +# foreach my $key2 (@SearchEnginesSearchIDOrder_list2) { if ($key2 eq $key) { error("$key is in 1 and 2\n"); } } +# foreach my $key2 (@SearchEnginesSearchIDOrder_listgen) { if ($key2 eq $key) { error("$key is in 1 and gen\n"); } } +#} } +#foreach my $key (@SearchEnginesSearchIDOrder_list2) { +# if (! $SearchEnginesHashID{$key}) { error("Entry '$key' has been found in SearchEnginesSearchIDOrder_list2 with no value in SearchEnginesHashID"); +# foreach my $key2 (@SearchEnginesSearchIDOrder_list1) { if ($key2 eq $key) { error("$key is in 2 and 1\n"); } } +# foreach my $key2 (@SearchEnginesSearchIDOrder_listgen) { if ($key2 eq $key) { error("$key is in 2 and gen\n"); } } +#} } +#foreach my $key (@SearchEnginesSearchIDOrder_listgen) { if (! $SearchEnginesHashID{$key}) { error("Entry '$key' has been found in SearchEnginesSearchIDOrder_listgen with no value in SearchEnginesHashID"); } } +#foreach my $key (keys %NotSearchEnginesKeys) { if (! $SearchEnginesHashID{$key}) { error("Entry '$key' has been found in NotSearchEnginesKeys with no value in SearchEnginesHashID"); } } +#foreach my $key (keys %SearchEnginesKnownUrl) { +# my $found=0; +# foreach my $key2 (values %SearchEnginesHashID) { +# if ($key eq $key2) { $found=1; last; } +# } +# if (! $found) { die "Entry '$key' has been found in SearchEnginesKnownUrl with no value in SearchEnginesHashID"; } +#} +#foreach my $key (keys %SearchEnginesHashLib) { +# my $found=0; +# foreach my $key2 (values %SearchEnginesHashID) { +# if ($key eq $key2) { $found=1; last; } +# } +# if (! $found) { die "Entry '$key' has been found in SearchEnginesHashLib with no value in SearchEnginesHashID"; } +#} +#print @SearchEnginesSearchIDOrder_list1." ".@SearchEnginesSearchIDOrder_list2." ".@SearchEnginesSearchIDOrder_listgen; + +1;