We have clarified our Privacy Statement even further. Please have a look at our changes.
Browse Source

+ added noRefererActions and whois crawler network detection

git-svn-id: svn+ssh://svn.izzysoft.de/public/histview/trunk@58 2e17d4db-7539-0410-9bcc-c6eb479cd550
tags/v0.1.9
Izzy 10 years ago
parent
commit
c8640526a4
5 changed files with 129 additions and 5 deletions
  1. +104
    -5
      class.download.inc
  2. +12
    -0
      class.hvconfig.inc
  3. +10
    -0
      histview.hist
  4. +2
    -0
      histview_crawlernets
  5. +1
    -0
      histview_ignorebots

+ 104
- 5
class.download.inc View File

@@ -27,6 +27,7 @@ class download extends hvconfig {
var $argsep = ";";
var $ignored_bots = array(); // Array[0..n] of strings (substr of bot names)
var $rejected_bots = array(); // Array[0..n] of strings (substr of bot names)
var $crawler_nets = array(); // Array[0..n] of strings (substr of net names)
var $remote_ua = "";

/** Initial setup of default filetypes and date format
@@ -45,6 +46,7 @@ class download extends hvconfig {
$this->basedir = $basedir;
if (!empty($icondir)) $this->icondir = $icondir;
$this->remote_ua = strtolower($_SERVER["HTTP_USER_AGENT"]);
$this->set_bots("crawler",$this->crawlerfile);
$this->set_bots("ignore",$this->ignorefile);
$this->set_bots("reject",$this->rejectfile);
if (is_array($this->db)) { // setup default db
@@ -65,6 +67,7 @@ class download extends hvconfig {
$bot = trim($bot);
if (empty($bot) || substr($bot,0,1)=="#") continue;
switch ($type) {
case "crawler" : $this->crawler_net($bot); break;
case "ignore" : $this->ignore_bot($bot); break;
case "reject" : $this->reject_bot($bot); break;
}
@@ -172,6 +175,11 @@ class download extends hvconfig {
elseif (is_string($bot)) $this->rejected_bots[] = $bot;
else trigger_error("Cannot add this bot(s) - wrong data type.",E_USER_WARNING);
break;
case "crawler" :
if (is_array($bot)) $this->crawler_nets = array_merge($this->crawler_nets,$bot);
elseif (is_string($bot)) $this->crawler_nets[] = $bot;
else trigger_error("Cannot add this bot(s) - wrong data type.",E_USER_WARNING);
break;
}
}

@@ -191,6 +199,14 @@ class download extends hvconfig {
$this->add_bot($bot,"reject");
}

/** Add network name(s) to the crawler network list
* @method crawler_net
* @param mixed name string or array of strings (substr of network/organisation names as found in whois data)
*/
function crawler_net($bot) {
  // Convenience wrapper: delegate to add_bot(), selecting the "crawler" list
  $listType = "crawler";
  $this->add_bot($bot,$listType);
}

/** Scan another directory
* Switches to another directory. This resets the internal file array and starts over.
* The filelist array will not be reset, so entries are added to it.
@@ -348,6 +364,92 @@ class download extends hvconfig {
}
}

/** Reject a client and quit
* @method reject
*/
function reject() {
  // Fetch the configured rejection header and message
  $rejectHeader = $this->rejectheader;
  $rejectBody   = $this->rejectmsg;

  // The header has to go out before any body output
  header($rejectHeader);
  print $rejectBody;

  // Terminate the script - nothing else may be sent to a rejected client
  die();
}

/** Handle request w/o referer
* @method refererCheck
*/
function refererCheck() {
  // Only requests without a referer are subject to the configured action
  if (!empty($_SERVER['HTTP_REFERER'])) return;
  switch ($this->noRefererAction) { // pass, deny, whois
    case "deny" :
      $this->reject();
      break;
    case "whois":
      // is_crawler_net is a method and must be *called*; reading it as a
      // property always yields NULL, so the whois check would never reject
      if ($this->is_crawler_net()) $this->reject();
      break;
    case "pass" :
    default     :
      // let the request through
      break;
  }
}

/** Query the whois database
* @method whois
* @param string host IP/hostname
* @param optional boolean raw whether to include the raw whois information (default: FALSE)
* @param optional string server whois server to use (default: "whois.arin.net")
* @return mixed whois array of whois fields, or FALSE if host is empty or the whois server could not be reached
*/
function whois($host,$raw=FALSE,$server="whois.arin.net") {
  if (empty($host)) return false;

  // fsockopen() declares $errno/$errstr as reference parameters itself.
  // Call-time pass-by-reference (&$var) is a fatal error since PHP 5.4.
  $errno  = 0;
  $errstr = '';
  $fp = @fsockopen($server,43,$errno,$errstr,15);
  if (!$fp) {
    trigger_error("Could not establish connection to whois server $server:43",E_USER_NOTICE);
    return false;
  }

  // Send the query and collect the complete response
  fputs($fp,"$host\r\n");
  $resp = '';
  while (!feof($fp)) {
    $chunk = fgets($fp,256);
    if ($chunk===FALSE) break; // read error/timeout: do not spin on a dead socket
    $resp .= $chunk;
  }
  fclose($fp);

  // Keys which may show up multiple times and thus are collected as lists
  $multi = array("remarks","source","tech-c","admin-c","mnt-ref","mnt-by","address","e-mail");

  // Always return an array on success, even if the response held no fields
  $whois = array();
  foreach (explode("\n",$resp) as $item) {
    $item = trim($item);
    if ( empty($item) || substr($item,0,1)=='%' || substr($item,0,1)=='#') continue;
    // Split at the first colon only, so values containing colons
    // (URLs, timestamps, IPv6 addresses) are kept complete
    $pair  = explode(':',$item,2);
    $key   = trim($pair[0]);
    $value = isset($pair[1]) ? trim($pair[1]) : '';
    if (in_array($key,$multi,TRUE)) $whois[$key][] = $value;
    else $whois[$key] = $value;
  }
  if ($raw) $whois["raw"] = $resp;

  // ARIN may delegate to another registry. Follow plain whois referrals once
  // (only when we started at ARIN, which also prevents endless recursion).
  if ($server=="whois.arin.net" && !empty($whois["ReferralServer"]) && is_string($whois["ReferralServer"])) {
    // Accept "whois://host", "whois://host:port" or a bare host name. Other
    // schemes (e.g. rwhois://) speak a different protocol and are skipped.
    // The port is dropped since the query always goes to port 43.
    if (preg_match('!^(?:whois://)?([a-z0-9._-]+)(?::\d+)?/?$!i',$whois["ReferralServer"],$match)
        && strtolower($match[1])!=$server) {
      $referred = $this->whois($host,$raw,$match[1]);
      // Keep the ARIN data if the referral server could not be queried
      if (is_array($referred)) $whois = $referred;
    }
  }
  return $whois;
}

/** Network check (where does the request come from)
* @method is_crawler_net
* @return boolean
*/
function is_crawler_net() {
  // Nothing to compare against, or no client address: skip the (slow) whois query
  if (empty($this->crawler_nets) || empty($_SERVER['REMOTE_ADDR'])) return FALSE;

  $whois = $this->whois($_SERVER['REMOTE_ADDR']);
  // whois() yields FALSE when the lookup failed - treat that as "not a crawler net"
  if (!is_array($whois)) return FALSE;

  // Collect the lower-cased whois fields to inspect; not every registry
  // delivers all of them, so missing ones are simply left out
  $haystacks = array();
  foreach (array('netname','descr','OrgName','OrgNOCName') as $field) {
    if (empty($whois[$field]) || !is_string($whois[$field])) continue;
    $haystacks[] = strtolower($whois[$field]);
  }

  foreach ($this->crawler_nets as $bot) {
    // Compare case-insensitively; skip empty specs (strpos rejects empty needles)
    $needle = strtolower(trim($bot));
    if ($needle==='') continue;
    foreach ($haystacks as $haystack) {
      if (strpos($haystack,$needle)!==FALSE) return TRUE;
    }
  }
  return FALSE;
}

/** Download a file (if the remote UA is not in the reject list)
* @method sendfile
* @param string filename File to download
@@ -356,12 +458,9 @@ class download extends hvconfig {
*/
function sendfile($fname,$dir="") {
foreach ($this->rejected_bots as $ua) {
if (strpos($this->remote_ua,strtolower($ua))!==FALSE) { // kick off bots
header($this->rejectheader);
echo $this->rejectmsg;
exit;
}
if (strpos($this->remote_ua,strtolower($ua))!==FALSE) $this->reject(); // kick off bots
}
$this->refererCheck();
if (empty($dir)) $dir = $this->dir;
$file = $dir."/".$fname;
if (!file_exists($file)) return FALSE;

+ 12
- 0
class.hvconfig.inc View File

@@ -176,6 +176,18 @@ class hvconfig {
*/
$this->max_relnum = 9;
//===================================================[ Download Settings ]===
/** Action for requests w/o referer
* Valid actions are: "pass" (no action), "deny" (reject request), "whois"
* (network check - and reject crawler networks)
* @class hvconfig
* @attribute string noRefererAction
*/
$this->noRefererAction = "pass";
/** Read crawler network names from a file
* @class hvconfig
* @attribute string crawlerfile
*/
$this->crawlerfile = dirname(__FILE__)."/histview_crawlernets";
/** Read bots to ignore from a file
* @class hvconfig
* @attribute string ignorefile

+ 10
- 0
histview.hist View File

@@ -3,6 +3,16 @@ $Id$
History for HistView
====================

v0.1.7
-------------------
+ adding referer check. The noRefererAction setting controls how requests with
an empty referer are handled: "pass", "deny", or "whois" (check for crawler/bot nets)
+ adding remote network check. Some crawlers (especially MS ones) fake their UAs.
But since they always leave the referer empty, the network checks can be limited
to requests without referer. Use the histview_crawlernets file to specify their
network information (one spec per line, all lowercase). These specs are checked
against the whois information for 'netname', 'OrgName', 'OrgNOCName', and 'descr'.

v0.1.6 (01.01.2009)
-------------------
+ adding the possibility to reject file requests based on user agent (or to

+ 2
- 0
histview_crawlernets View File

@@ -0,0 +1,2 @@
microsoft
search media inc

+ 1
- 0
histview_ignorebots View File

@@ -1,3 +1,4 @@
freshmeat.net URI validator
Softonic Link Checker
ABCDbot
IEAutoDiscovery

Loading…
Cancel
Save