Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add spider RequestDeduplicationMiddleware #261

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 107 additions & 0 deletions src/Spider/Middleware/RequestDeduplicationMiddleware.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
<?php

namespace RoachPHP\Spider\Middleware;

use Psr\Log\LoggerInterface;
use RoachPHP\Http\Request;
use RoachPHP\Http\Response;
use RoachPHP\Support\Configurable;

/**
 * Drop duplicate requests to avoid unnecessary work.
 *
 * Every request URI is normalized according to the configured options,
 * hashed, and remembered in a size-limited, in-memory map of
 * hash => hit count. A request whose normalized URI is already present
 * in that map is dropped.
 *
 * Supported options (see defaultOptions()):
 *  - ignore_url_fragments:    URIs that differ only by "#fragment" are duplicates.
 *  - ignore_trailing_slashes: URIs that differ only by trailing "/" in the path are duplicates.
 *  - ignore_query_string:     URIs that differ only by "?query" are duplicates.
 *  - seen_uris_cache_size:    maximum number of remembered URI hashes.
 *
 * Because the cache is bounded, deduplication is best effort: a URI whose
 * hash has been evicted is let through again the next time it is seen.
 */
class RequestDeduplicationMiddleware implements RequestMiddlewareInterface
{
    use Configurable;

    /**
     * Number of times each normalized URI has been seen, keyed by URI hash.
     *
     * @var array<string, int>
     */
    private array $seenUriHashesHits = [];

    public function __construct(private LoggerInterface $logger)
    {
    }

    /**
     * Default values for every option this middleware understands.
     *
     * @return array<string, bool|int>
     */
    private function defaultOptions(): array
    {
        return [
            'ignore_url_fragments' => false,
            'ignore_trailing_slashes' => true,
            'ignore_query_string' => false,
            'seen_uris_cache_size' => 10000,
        ];
    }

    /**
     * Drop the request if its normalized URI has been seen before.
     *
     * @return Request the untouched request, or the result of dropping it
     *                 when its URI is a duplicate
     */
    public function handleRequest(Request $request, Response $response): Request
    {
        $uri = $request->getUri();

        if ($this->isDuplicatedUri($uri)) {
            $this->logger->info(
                '[RequestDeduplicationMiddleware] Dropping duplicate request',
                ['uri' => $uri],
            );

            return $request->drop('Duplicate request');
        }

        return $request;
    }

    /**
     * Record a sighting of $uri and report whether it was already known.
     */
    private function isDuplicatedUri(string $uri): bool
    {
        $uriHash = $this->hashUri($uri);

        if (isset($this->seenUriHashesHits[$uriHash])) {
            ++$this->seenUriHashesHits[$uriHash];

            return true;
        }

        $this->seenUriHashesHits[$uriHash] = 1;
        $this->cacheEviction($uriHash);

        return false;
    }

    /**
     * Compute the cache key for a URI.
     *
     * md5 is used purely as a fast, compact fingerprint; nothing here relies
     * on it being cryptographically secure.
     */
    private function hashUri(string $uri): string
    {
        return md5($this->normalizeUri($uri));
    }

    /**
     * Rebuild $uri from its components with the configured parts removed.
     *
     * Only core PHP functions are used: http_build_url() and the HTTP_URL_*
     * constants belong to the legacy pecl_http 1.x extension and are not
     * available on a stock PHP 8 installation.
     *
     * The returned string is a deduplication key and is not guaranteed to be
     * a byte-for-byte valid rendering of the original URI.
     */
    private function normalizeUri(string $uri): string
    {
        $parts = parse_url($uri);

        // parse_url() returns false on seriously malformed input; fall back
        // to the raw string so such URIs are still deduplicated verbatim.
        if ($parts === false) {
            return $uri;
        }

        if ($this->option('ignore_url_fragments')) {
            unset($parts['fragment']);
        }

        if ($this->option('ignore_query_string')) {
            unset($parts['query']);
        }

        if ($this->option('ignore_trailing_slashes') && isset($parts['path'])) {
            $parts['path'] = rtrim($parts['path'], '/');
        }

        $normalized = isset($parts['scheme']) ? $parts['scheme'] . ':' : '';

        if (isset($parts['host'])) {
            $normalized .= '//';

            if (isset($parts['user'])) {
                $normalized .= $parts['user'];

                if (isset($parts['pass'])) {
                    $normalized .= ':' . $parts['pass'];
                }

                $normalized .= '@';
            }

            $normalized .= $parts['host'];

            if (isset($parts['port'])) {
                $normalized .= ':' . $parts['port'];
            }
        }

        $normalized .= $parts['path'] ?? '';

        if (isset($parts['query'])) {
            $normalized .= '?' . $parts['query'];
        }

        if (isset($parts['fragment'])) {
            $normalized .= '#' . $parts['fragment'];
        }

        return $normalized;
    }

    /**
     * Shrink the cache once it exceeds the configured size.
     *
     * Entries whose hit count is not strictly above the average are evicted.
     * The entry that was just inserted always has the minimum hit count, so
     * it would be evicted together with everything else whenever the cache
     * overflows; it is therefore put back as long as doing so keeps the
     * cache within the configured limit.
     *
     * @param string $newestHash hash of the entry inserted right before this call
     */
    private function cacheEviction(string $newestHash): void
    {
        $limit = $this->option('seen_uris_cache_size');
        $size = count($this->seenUriHashesHits);

        if ($size <= $limit) {
            return;
        }

        // $size is at least 1 here because the caller has just inserted an entry.
        $averageHitCount = array_sum($this->seenUriHashesHits) / $size;
        $retained = array_filter(
            $this->seenUriHashesHits,
            static fn ($hitCount) => $hitCount > $averageHitCount,
        );

        if (count($retained) < $limit) {
            $retained[$newestHash] = $this->seenUriHashesHits[$newestHash];
        }

        $this->seenUriHashesHits = $retained;

        $this->logger->info(
            '[RequestDeduplicationMiddleware] Cache eviction',
            [
                'average_hit_count' => $averageHitCount,
                'remaining_cache_size' => count($this->seenUriHashesHits),
            ],
        );
    }
}
80 changes: 80 additions & 0 deletions tests/Spider/Middleware/RequestDeduplicationMiddlewareTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
<?php

declare(strict_types=1);

namespace RoachPHP\Tests\Spider\Middleware;

use PHPUnit\Framework\TestCase;
use RoachPHP\Spider\Middleware\RequestDeduplicationMiddleware;
use RoachPHP\Testing\Concerns\InteractsWithRequestsAndResponses;
use RoachPHP\Testing\FakeLogger;

/**
 * Tests for RequestDeduplicationMiddleware: dropping repeated URIs and
 * evicting rarely seen URIs once the cache size is exceeded.
 *
 * @internal
 */
final class RequestDeduplicationMiddlewareTest extends TestCase
{
    use InteractsWithRequestsAndResponses;

    public function testDropDuplicatedRequest(): void
    {
        $middleware = $this->createMiddleware();
        $uri = 'http://localhost/';

        // The first sighting passes through, the second one is dropped.
        self::assertFalse($this->isDropped($middleware, $uri));
        self::assertTrue($this->isDropped($middleware, $uri));
    }

    public function testCacheEviction(): void
    {
        $middleware = $this->createMiddleware(2);
        [$uriA, $uriB, $uriC] = ['http://localhost/a', 'http://localhost/b', 'http://localhost/c'];

        // Warm-up: A is requested three times, B once.
        for ($attempt = 0; $attempt < 3; ++$attempt) {
            $this->isDropped($middleware, $uriA);
        }
        $this->isDropped($middleware, $uriB);

        // Both URIs are known by now, so repeating them yields duplicates.
        self::assertTrue($this->isDropped($middleware, $uriA));
        self::assertTrue($this->isDropped($middleware, $uriB));

        // C is new; remembering it overflows the cache of size 2 and
        // forces some entries to be evicted.
        self::assertFalse($this->isDropped($middleware, $uriC));

        // B was evicted, so it is treated as a new URI again.
        self::assertFalse($this->isDropped($middleware, $uriB));

        // A survived the eviction because it was requested more often than the others.
        self::assertTrue($this->isDropped($middleware, $uriA));
    }

    /**
     * Send a request for $uri through the middleware and report whether the
     * middleware dropped it.
     */
    private function isDropped(RequestDeduplicationMiddleware $middleware, string $uri): bool
    {
        $processedRequest = $middleware->handleRequest(
            $this->makeRequest($uri),
            $this->makeResponse(),
        );

        return $processedRequest->wasDropped();
    }

    /**
     * Build the middleware under test, optionally overriding the size of its
     * seen-URI cache.
     */
    private function createMiddleware(?int $cacheSize = null): RequestDeduplicationMiddleware
    {
        $middleware = new RequestDeduplicationMiddleware(new FakeLogger());

        if ($cacheSize === null) {
            return $middleware;
        }

        $middleware->configure(['seen_uris_cache_size' => $cacheSize]);

        return $middleware;
    }
}