<?
/*
Anti spidering
Idea -
count access from Class_C_NET(IP)
block access when we have too many accesses
allow good spiders/proxies - google, bing, att, t-mobile, ...
divide people into several groups
* admin - no protection
* google - no protection
* usual visitors (unknown IPs, low limit)
* registered visitors (unknown IPs, high limit)
* offenders (known IPs - kept in memcached)
* big-known-offenders (known IPs - kept in common.yaml file) C("ip.blacklist")
In order to minimize memory use we use 2-layer accounting
* soft - basic accounting, identify offenders - pass offenders for hard limiting
* reg - registered account accounting
* hard - medium-time accounting (default 10hr), once hard-limit reached, IP is saved to known-offender list, such IP should be perma-banned in nginx
Known offender check is only performed when soft limit is hit
Usage (simple):
if (i("anti-spider")->hit()) {
header("HTTP/1.1 403 Request Limit Reached");
die;
}
Usage (captcha):
if (i("anti-spider")->captchaOrDie()) // can DIE! when blocking
return "captcha";
References:
re-captcha FAQ: https://developers.google.com/recaptcha/docs/faq
re-captcha Doc: https://developers.google.com/recaptcha/intro
*/
class Antispider {
public static $generation = 3; // increase every time you change rules; it is part of the counter/check cache keys, so old entries are abandoned

/*
    Config keys (defaults in self::$DEFAULT, overridden by the "antispider" config section and by constructor params):

    soft_limit / soft_timeout       - below soft_limit the request is simply counted; the counter is stored for soft_timeout seconds.
                                      soft_limit is when we start to PAY ATTENTION, it is not a blocking limit.
    reg_limit                       - logged-in visitors (id() is truthy) pass while the counter is below reg_limit;
                                      the counter is then stored for hard_timeout seconds.
    captcha_limit / captcha_timeout - once soft_limit is passed the counter is stored for captcha_timeout seconds;
                                      above captcha_limit the visitor gets a captcha (or a block when blacklisted).
    hard_limit / hard_timeout       - counter stored for hard_timeout seconds; above hard_limit the visitor is blocked (ACTUAL LIMIT).
    google_recaptcha_url            - reCAPTCHA verification endpoint used by validateUserResponse().
*/

public $C;                          // effective config: k => v (see __construct)
public $hit = null;                 // hit counter of the visitor's network; initialized by hit(), stays null when the check is bypassed
public $verdict = [];               // $verdict[0] is the cached result of hit(); an array is used so a null verdict can be cached too
public $captcha_blacklist_flag = 0; // set to 1 by _hit() when a blacklisted IP is blocked instead of being offered a captcha

static $DEFAULT = [
    'soft_limit'      => 160,
    'soft_timeout'    => 300,   // 5 Min - THIS IS NOT A BLOCKING TIMEOUT !!
    'captcha_limit'   => 500,   // SHOULD be greater than soft_limit
    'captcha_timeout' => 18000, // 5 hours ~same as in ::check()
    'reg_limit'       => 750,
    'hard_limit'      => 900,
    'hard_timeout'    => 36000, // 10 hours

    // IMPORTANT!
    // No default reCAPTCHA key/secret is shipped on purpose: a default pair is too easy to misuse
    // on an unrelated site, and a secret must never be committed to the source tree.
    // Put the proper credentials into config.yaml, ex:
    //   antispider:
    //     v2:
    //       google_recaptcha_sitekey: <your-site-key>
    //       google_recaptcha_secret: <your-secret>
    'google_recaptcha_url' => 'https://www.google.com/recaptcha/api/siteverify',
];
/**
 * To be used in Actions: ask the visitor to solve a captcha OR terminate the script (block).
 *
 * Runs hit() and acts on its verdict:
 *  - no verdict (null) : request is fine, returns false
 *  - "captcha"         : sends "429 Too Many Requests" and returns true, the caller should render the captcha
 *  - anything else     : sends "403 Request Limit Reached" and terminates the script
 *
 * Usage:
 *   if (i("anti-spider")->captchaOrDie())
 *       return "captcha";
 *
 * @return bool true when a captcha must be shown, false when the request may proceed
 *              (does not return at all when the visitor is blocked)
 */
PUBLIC function captchaOrDie() : bool {
    // the verdict has to come from hit(); it was previously read from a never-assigned local variable
    $verdict = $this->hit();
    if (! $verdict)
        return false;

    if ($verdict == "captcha") {
        header('HTTP/1.1 429 Too Many Requests');
        return true; // "captcha"
    }

    // hard block; no extra reporting here, hit() already reported the offender
    Profiler::disable();
    header("HTTP/1.1 403 Request Limit Reached");
    die("Access Denied - Request Limit Reached");
}
/**
 * Protect Sensitive Pages, like pages with API calls
 *
 * same idea as captchaOrDie(), just much more strict rules:
 * a per-IP counter is kept via \Cache::inc(), the request is denied once the counter reaches $limit.
 * Admins and "good" IPs (isGoodSpider) are never counted; blacklisted IPs are denied right away.
 *
 * reports statistic to iStat - nodes: "sensitiveHit-***"
 * reports offenders to log and Slack #prod-alerts channel
 *
 * Important! : method MUST be called from ACTION ONLY (before we sent any htmls)
 *
 * Usage:
 *   i("anti-spider")->sensitiveHitOrDie();
 * Usage 2:
 *   [$block, $reason] = i("anti-spider")->sensitiveHitOrDie(stopScript:false);
 *   if ($block) {
 *       return "/template/access-limit-reached";
 *   }
 *   ... continue with a script
 *
 * @param int  $timeout    lifetime (seconds) passed to \Cache::inc() for the per-IP counter
 * @param int  $limit      the request is denied when the counter is >= $limit
 * @param bool $stopScript true: send "429 Too Many Requests" and terminate the script on deny;
 *                         false: return the verdict to the caller instead
 * @return array [int $block, string $reason] - $block is 0 (allow) or 1 (deny)
 */
PUBLIC function sensitiveHitOrDie(int $timeout = 1800, int $limit = 20, $stopScript = true) : array {
    if (Debug::is_admin()) {
        \Profiler::info("anti-spider::sensitiveHit", "check suppressed");
        $this->iStat('sensitiveHit-admin');
        return [0, "admin"];
    }
    $ip = ip();
    if (self::isGoodSpider($ip)) { # isGoodIP
        $this->iStat('sensitiveHit-good_ip');
        return [0, "good-ip"];
    }

    // log + Slack report: when the counter hits the limit and then on every 100th access
    $report = function ($reason, $cnt) use ($ip, $limit) {
        if (! ($cnt == $limit || ($cnt % 100 == 0)))
            return;
        \Log::notice("Antispider::sensitiveHit block:$reason cnt=$cnt"); // ip and script name shown by Log method
        $log_cnt = 1;
        # !!!! for NOW we'll report ALL; to throttle the reports wrap the lines below in:
        #   if ($log_cnt = i('cache')->onceCount("sensitiveHit-report", 600)) { ... }
        $r_ip = long2ip($ip);
        $text = "Antispider *BLOCKING* Sensitive Hit: $reason ip=$r_ip".cs($log_cnt>1, "; blocks-skipped: $log_cnt");
        \HB::slackPost(['channel' => '#prod-alerts', 'text' => $text]);
    };

    // deny the request: report, then either terminate the script or hand the verdict back
    $deny = function ($reason, $cnt) use ($report, $stopScript) {
        $report($reason, $cnt);
        if ($stopScript) {
            header('HTTP/1.1 429 Too Many Requests');
            Profiler::disable();
            die;
        }
        return [1, $reason];
    };

    $netName = \utils\IP::classifyIP($ip, "blacklist");
    if ($netName) {
        $this->iStat('sensitiveHit-blacklist');
        // Blacklisted IPs are not counted. The counter used to be read here before it was assigned
        // (undefined variable); 0 keeps the effective behaviour: 0 % 100 == 0, so every blacklisted hit is reported.
        return $deny("blacklisted-ip:$netName", 0);
    }

    $key = "anti-spider".self::$generation."s.hit".$ip; // Counter based on IP
    $cnt = \Cache::inc($key, 1, $timeout);
    if ($cnt < $limit) {
        $this->iStat('sensitiveHit-allow');
        return [0, "count=$cnt < $limit"];
    }

    // we got an offender !!
    $this->iStat('sensitiveHit-deny');
    return $deny("count=$cnt >= $limit", $cnt);
}
/**
 * To be used in Actions: hide adsense from stupid Robots.
 *
 * Decision order:
 *  1. request carries a non-empty `Referer`            => show
 *  2. hit() returned a verdict (captcha / block)       => hide
 *  3. more than 3 counted hits and still no `Referer`  => show only to Google IPs
 *  4. otherwise                                        => show
 *
 * Calling this method also initializes $this->hit (via hit()).
 *
 * Usage:
 *   $this["G.NO_ADSENSE"] = ! i("anti-spider")->canShowAdsense(); // attn to "!"
 *
 * @return bool true when adsense may be shown
 */
PUBLIC function canShowAdsense() : bool {
    if (! empty($_SERVER["HTTP_REFERER"]))
        return true;

    $verdict = $this->hit(); // also fills $this->hit
    if ($verdict)
        return false; // no adsense for visitors that got a verdict

    if (! ($this->hit > 3))
        return true; // too few hits to judge

    // several queries, still no referer: robot - adsense only for Google
    return $this->isGoogleIP(ip());
}
/**
 * Cache key of the common hit counter of the visitor's network.
 *
 * The key is built from the rules generation, the day of the month (date("d"), so a new key is
 * used every day) and the visitor IP with the lowest 8 bits masked out (NET based counter).
 *
 * @return string
 */
private function hitCounterKey() {
    $network = ip() & 0xffffff00;
    $dayOfMonth = date("d");
    return "anti-spider" . self::$generation . "-" . $dayOfMonth . "/net" . $network;
}
/**
 * Count current request and return the verdict; the result is computed once per instance.
 *
 * The verdict is kept in $this->verdict[0]: a non-empty array means "already computed",
 * which makes it possible to cache a null verdict as well.
 *
 * @return string|null null - OK, "captcha" - soft block (not a real block), "block" - just block (offender)
 */
PUBLIC function hit() { # null - OK, "block", "captcha"
    if (! $this->verdict)
        $this->verdict[0] = $this->_hit(); // first call: do the actual accounting
    return $this->verdict[0];
}
/**
 * Actual accounting behind hit(): bump the network counter and decide what to do with the request.
 *
 * Zones, by counter value ($this->C holds the limits):
 *  - below soft_limit                  : pass, counter kept for soft_timeout
 *  - logged-in visitor below reg_limit : pass, counter kept for hard_timeout
 *  - above captcha_limit               : pass if check() allows, "block" for blacklisted IPs, otherwise "captcha"
 *  - above hard_limit                  : pass if check() allows, otherwise alert admins and "block"
 *  - anything else                     : pass
 *
 * Side effects: sets $this->hit, writes the counter to Cache, reports to iStat / Profiler,
 * may send a "429" header and may set $this->captcha_blacklist_flag.
 *
 * @return string|null null - OK, "block", "captcha"
 */
private function _hit() { # null - OK, "block", "captcha"
    // do not block admins, nor x-admin requests whose X-Real-IP starts with "172.16.8"
    // (a plain string prefix: it matches 172.16.8.* as well as 172.16.80-89.*)
    $bypass = Debug::is_admin()
        || (Debug::is_x_admin() && starts_with($_SERVER["HTTP_X_REAL_IP"] ?? "", "172.16.8"));
    if ($bypass) {
        Profiler::info("anti-spider", "BYPASS");
        $this->iStat('bypass');
        return null;
    }

    $cfg  = $this->C;
    $key  = $this->hitCounterKey();
    $hits = (int)Cache::get($key) + 1;
    $this->hit = $hits;

    // give some slack for ajax queries (mostly completers)
    $uri  = $_SERVER['REQUEST_URI'] ?? "";
    $ajax = CD("AJAX");
    if (starts_with($uri, "/srv/") || starts_with($uri, "/ng/srv/"))
        $ajax = 1;

    // random_int(0, 400) > 5 holds for 395 of 401 values, i.e. only ~1.5% of ajax hits take the normal path
    if ($hits > 2 && $ajax && random_int(0, 400) > 5) {
        $this->iStat('ajax-bypass');
        if ($hits < $cfg['captcha_limit'])
            return null; // not counted at all: the counter is not written back
        $hits--; // above captcha limit: go through the checks, but without counting this hit
    }

    Profiler::info("anti-spider", "key:$key, hits:$hits");
    $this->iStat('requests');

    // SOFT ZONE - just count
    if ($hits < $cfg['soft_limit']) {
        Cache::put($key, $hits, $cfg['soft_timeout']);
        $this->iStat('no-captcha');
        return null;
    }

    // REGISTERED User Limits
    if (id() && $hits < $cfg['reg_limit']) {
        Cache::put($key, $hits, $cfg['hard_timeout']);
        $this->iStat('registered');
        return null;
    }

    // CAPTCHA ZONE - save data for a long time
    Cache::put($key, $hits, $cfg['captcha_timeout'] ?? self::$DEFAULT['captcha_timeout']);

    // User Limits to Captcha
    if ($hits > $cfg['captcha_limit']) {
        if ($this->check() == 'allow') {
            $this->iStat('allow');
            return null;
        }
        // no captcha for blacklisted
        $netName = \utils\IP::classifyIP(ip(), "blacklist");
        if ($netName) {
            $this->iStat('blacklist');
            $this->captcha_blacklist_flag = 1;
            $this->_reportCheck(ip(), true, $netName);
            return "block";
        }
        $this->iStat('captcha');
        header('HTTP/1.1 429 Too Many Requests');
        return "captcha";
    }

    // HARD ZONE - save data for a long time
    Cache::put($key, $hits, $cfg['hard_timeout']);

    // OUT OF HARD ZONE
    // NOTE(review): every counter above captcha_limit has already returned above, so this branch is
    // reachable only when hard_limit is configured below captcha_limit - confirm this is intended.
    if ($hits > $cfg['hard_limit']) {
        if ($this->check() == 'allow') {
            $this->iStat('allow');
            return null;
        }
        $this->alert($hits);
        return "block";
    }

    $this->iStat('passed');
    return null;
}
/**
 * Notify Admins about Blocking: one log record + one e-mail per network per 10 hours.
 *
 * The throttle key is built from the visitor IP with the lowest 8 bits masked out (Class C NET)
 * and is guarded by once(..., 36000).
 *
 * @param int $hits cumulative hit count to mention in the alert
 */
PUBLIC function alert($hits) {
    $ip  = ip();
    $net = $ip & 0xffffff00; // Class C NET
    // One alert per network per 10 hours
    if (! once("anti-spider:netblock-alert".self::$generation.":$net", 36000))
        return;

    $hip = long2ip($ip);
    // gethostbyaddr() returns false on malformed input - fall back to the plain IP
    $host = gethostbyaddr($hip) ?: $hip;
    $reverse_ip = gethostbyname($host);

    // request headers are not guaranteed to be present (CLI, broken clients)
    $agent = $_SERVER["HTTP_USER_AGENT"] ?? "";
    $site  = $_SERVER['HTTP_HOST'] ?? "unknown-host";
    $uri   = $_SERVER["REQUEST_URI"] ?? "";

    $summary = "ip:$hip ($agent) host: $host host_ip: $reverse_ip cumulative-hits: $hits";
    \Log::warning("AntiSpider block alert:\n $summary");
    Alert::send(/*"odessa",*/ "parf@difive.com",
        $site." i('anti-spider')",
        $summary."\nlast-rq-uri: ".$uri);
}
// -----------------------------------------
//
// INTERNAL
//
/**
 * Build effective config $this->C.
 *
 * Array union keeps the left-most value of a key, so the precedence is:
 * constructor params, then the "antispider" config section, then self::$DEFAULT.
 *
 * @param array $p config overrides: k => v
 */
function __construct($p=[]) {
    $fromConfig = NVL(CC("antispider"), []);
    $merged = $p + $fromConfig;
    $this->C = $merged + self::$DEFAULT;
}
/**
 * Report counters to the 'anti-spider' Stat instance.
 *
 * @param array|string $what node => increment map; a scalar node name is a shortcut for [name => 1]
 */
function iStat(/*array*/ $what) {
    $counters = is_array($what) ? $what : [$what => 1];
    i('Stat', 'anti-spider')->hit($counters);
}
/**
 * Check visitor vs known good spiders / proxies.
 * Called from _hit() once the captcha or the hard limit is exceeded.
 *
 * The verdict is cached for 5 hours per network (IP with the lowest 8 bits masked out),
 * so it is calculated from the ONE IP that happened to trigger the check.
 * Every freshly calculated verdict is reported via iStat and _reportCheck().
 *
 * @return string "allow" | "block"
 */
function check() { # allow | block
    $ip = ip();
    $cacheKey = "anti-spider".self::$generation."-check".($ip & 0xFFFF_FF00);

    // CACHE
    $cached = Cache::get($cacheKey);
    if ($cached)
        return $cached;

    $goodNet = $this->isGoodSpider($ip); // network name, falsy for unknown IPs
    $isBlock = ! $goodNet;
    $verdict = $isBlock ? "block" : "allow";

    $this->iStat($isBlock ? 'block' : 'good-spider');
    $this->_reportCheck($ip, $isBlock, $goodNet);

    Cache::put($cacheKey, $verdict, 3600 * 5); // checking once / 5*hour
    return $verdict;
}
/**
 * Send alerts about allow / block decisions to logs and to Slack #prod-alerts.
 * Internal, called from check() and _hit().
 *
 * Most of the body is spam control; a detailed report is sent only when every filter passes:
 *  - allowed google-ip                : never reported
 *  - allowed good spider              : summary per network name, throttled via onceCount (8h key), needs count >= 3
 *  - blocked + blacklisted            : summary per blacklist name, throttled via onceCount (8h key), needs count >= 4
 *  - per network (lowest 8 bits off)  : throttled via onceCount (5 days key)
 *  - blocked + known spider UA        : skipped (admantx.com, YandexBot, Slurp)
 *  - non-US country                   : summary per country, throttled via onceCount (8h key), needs count >= 4
 *  - blocked + amazonaws / Azure Cloud / Hetzner : skipped
 *
 * NOTE(review): \Cache::onceCount() is assumed to return a falsy value while the report is throttled
 * and a count otherwise - confirm against the Cache implementation.
 *
 * @param int    $ip       visitor IP
 * @param bool   $is_block true when reporting a block, false when reporting an allow
 * @param string $netName  classified network name (empty when unknown)
 */
function _reportCheck(int $ip, bool $is_block, string $netName) {
    if (! $is_block && $netName == 'google-ip')
        return;

    if (! $is_block) {
        // report once/8hour for every goodSpider
        $cnt = \Cache::onceCount("AS:_reportCheck/$netName", 3600*8);
        if (! $cnt)
            return; // skip reporting
        if ($cnt < 3)
            return; // less spam
        $text = "Antispider Allow *$netName* cnt=$cnt";
        \HB::slackPost([
            'channel' => '#prod-alerts',
            'text' => $text
        ]);
    }

    $op = $is_block ? "`block`" : "allow $netName";
    $s_ip = long2ip($ip);

    // less spam from Blacklisted IPs
    if ($is_block && ($blackName = \utils\IP::classifyIP($ip, "blacklist"))) {
        $op = "blacklisted";
        // report once/8hour
        $cnt = \Cache::onceCount("AS:_reportCheck/$blackName", 3600*8);
        if (! $cnt)
            return;
        if ($cnt < 4)
            return; // no spam
        $text = "Antispider _blacklisted_ *$blackName* cnt=$cnt (once/8hr)";
        \HB::slackPost([
            'channel' => '#prod-alerts',
            'text' => $text
        ]);
    }

    // skip duplicates
    $net = $ip & 0xFFFF_FF00;
    $netCnt = \Cache::onceCount("AS:_reportCheck:net".$net, 3600*24*5);
    if (! $netCnt)
        return;

    // ---- less spam for known spiders
    $UA = $_SERVER['HTTP_USER_AGENT'] ?? "";
    if ($is_block) {
        // some bad-spiders masks themself as Yahoo Slurp - thats why its here
        // strpos() returns 0 for a match at the very start, so it must be compared against false
        foreach (["admantx.com", "YandexBot", "Slurp"] as $marker) {
            if (strpos($UA, $marker) !== false)
                return;
        }
    }

    // per country stats - excluding blacklisted IPs
    $cn = \GeoIP::country($s_ip);
    if ($cn !== 'US') {
        // log $cn summary
        $cnt = \Cache::onceCount("_reportCheck-$cn", 3600*8);
        if (! $cnt)
            return;
        if ($cnt < 4)
            return; // no spam
        $text = "Antispider(_".$op."_): country=*$cn block* count=$cnt";
        \HB::slackPost([
            'channel' => '#prod-alerts',
            'text' => $text
        ]);
        i('log')->notice($text);
    }

    $host = gethostbyaddr($s_ip);
    if ($host == $s_ip)
        $host = ""; // no reverse DNS record

    // filter out / ignore known offenders
    // too many ips to exclude them all
    if ($is_block && preg_match("!amazonaws\\.com$!", (string) $host))
        return;

    $netInfo = \utils\IP::netInfo($ip);
    // "!== false": names like "Hetzner Online GmbH" match at offset 0 and were never filtered before
    if ($is_block && strpos($netInfo['org']??"", "Azure Cloud") !== false)
        return;
    if ($is_block && strpos($netInfo['isp']??"", "Hetzner") !== false)
        return;

    if ($netCnt == 1)
        $netCnt = "";
    $text = "Antispider(_".$op."_): $s_ip ($cn)".cs(", host: %s", $host).cs(", net-cnt: %d", $netCnt).
        "\n\tagent: ".$UA.
        "\n\t".x2s($netInfo);
    $site = $_SERVER['HTTP_X_HOST'] ?? $_SERVER['HTTP_HOST'] ?? "unknown-host";
    $page = $_SERVER['HTTP_X_URI'] ?? $_SERVER['REQUEST_URI'] ?? "/unknown-page";
    \HB::slackPost([
        'channel' => '#prod-alerts',
        'text' => $text." @ $site$page",
        // $op is "`block`" / "blacklisted" / "allow ...", so comparing it with 'block' never matched
        "icon_emoji" => $is_block ? ":heavy_exclamation_mark:" : ":information_source:"
    ]);
    i('log')->notice(str_replace("\t", " ", $text));
}
/**
 * Is Good IP - good spiders, commonly used good proxies, our IPs.
 * Thin wrapper around \utils\IP::classifyIP($ip) with its default classification list.
 *
 * NOTE(review): the declared return type is string, so classifyIP() is presumed to return
 * a string (empty for unknown IPs) - confirm against \utils\IP.
 *
 * @param string|int $ip IP
 * @return string goodIP_NetName, falsy when the IP is not known as good
 *
 * examples / tests:
 *   i("anti-spider")->isGoodSpider("172.32.0.1")
 *     t-mobile
 *   i("anti-spider")->isGoodSpider("66.249.66.1")
 *     google-ip
 */
function isGoodSpider($ip) : string { # isGoodIP network name
    $netName = \utils\IP::classifyIP($ip);
    return $netName;
}
/**
 * Asks google.com whether the answer to recaptcha is really valid.
 *
 * Posts secret / response / visitor ip to $this->C['google_recaptcha_url'] and inspects the JSON reply.
 * The secret is taken from config: "antispider.v3.google_recaptcha_secret" for version 3,
 * otherwise "antispider.v2.google_recaptcha_secret" with "antispider.google_recaptcha_secret" as fallback.
 *
 * Any failure (empty response, transport error, non-JSON reply, exception) counts as "not valid".
 *
 * @param string $recaptcha_response token submitted by the visitor
 * @param int $version - re-captcha version 2|3
 * @param float $score_threshold - For recaptcha V3: in addition to $data["success"], the reply's
 *                                 $data["score"] must be >= $score_threshold to pass validation
 * @return boolean
 */
public function validateUserResponse($recaptcha_response, $version = 2, float $score_threshold = 1.0) {
    if (! $recaptcha_response)
        return false;

    try {
        if (3 == $version) {
            $secret = C("antispider.v3.google_recaptcha_secret");
        } else {
            $secret = NVL(CC("antispider.v2.google_recaptcha_secret"), CC("antispider.google_recaptcha_secret"));
        }

        $res = Curl::post(
            $this->C['google_recaptcha_url'],
            ["secret" => $secret, "response" => $recaptcha_response, "remoteip" => ip()]
        );
        if (! $res)
            return false;

        // json_decode() gives null for a non-JSON reply - do not index into it
        $data = json_decode($res, true);
        if (! is_array($data) || empty($data["success"]))
            return false;

        // V3 extended logic: replies carrying a score must also reach the threshold
        if (isset($data["score"]))
            return (float) $data["score"] >= $score_threshold;

        return true;
    } catch (Exception $e) {
        // validation stays best-effort (visitor is simply not verified), but leave a trace in the log
        \Log::notice("Antispider::validateUserResponse failed: ".$e->getMessage());
    }
    return false;
}
/**
 * Wrapper around validateUserResponse(): grants network amnesty when the captcha is passed,
 * i.e. deletes the hit counter of the visitor's network from Cache and reports 'verified' to iStat.
 * Admins always pass, without any validation or counter reset.
 *
 * @param string $recaptcha_response
 * @param int $version
 * @param float $score_threshold
 * @return bool true when the visitor is verified
 */
public function verifyCaptcha($recaptcha_response, $version = 2, float $score_threshold = 1.0) {
    if (Debug::is_admin())
        return true;

    $passed = $this->validateUserResponse($recaptcha_response, $version, $score_threshold);
    if (! $passed)
        return false;

    // amnesty network
    Cache::delete($this->hitCounterKey());
    $this->iStat('verified');
    return true;
}
/**
 * Check IP vs known google ips; delegates to \utils\IP::isGoogleIP().
 * Per the original note the list comes from https://www.gstatic.com/ipranges/goog.json and
 * 0 stands for "use current visitor IP" - both are handled inside \utils\IP (not verifiable from this file).
 *
 * @param string|int $ip
 * @return bool
 */
static function isGoogleIP(/* int|string */ $ip) : bool {
    $isGoogle = \utils\IP::isGoogleIP($ip);
    return $isGoogle;
}
}