From 33fed566e7b583bbff2de5dc4c13e546e2451b26 Mon Sep 17 00:00:00 2001 From: Joost de Valk Date: Thu, 22 Feb 2024 14:41:01 +0100 Subject: [PATCH] Don't group, as it's not supported by all UAs --- README.md | 40 +--------------------------------------- src/class-plugin.php | 43 ++++++++++++++++++++++++++++++++++--------- 2 files changed, 35 insertions(+), 48 deletions(-) diff --git a/README.md b/README.md index 639ccec..0df5e2d 100644 --- a/README.md +++ b/README.md @@ -15,45 +15,7 @@ Optimizes your site's robots.txt to reduce server load and CO2 footprint by bloc ## Default output -The default output of this plugin is as follows: - -```txt -# This site is very specific about who it allows crawling from. -# Our default is to not allow crawling: -User-agent: * -Disallow: / - -# Below are the crawlers that are allowed to crawl this site. -# Below that list, you'll find paths that are blocked, even for them, -# and then paths within those blocked paths that are allowed. -User-agent: Applebot -User-agent: ia_archiver -User-agent: Baiduspider -User-agent: Bingbot -User-agent: DuckDuckBot -User-agent: Googlebot -User-agent: AdsBot-Google -User-agent: MediaPartners-Google -User-agent: Yandex -User-agent: Slurp -User-agent: FacebookExternalHit -User-agent: LinkedInBot -User-agent: WhatsApp -User-agent: Twitterbot -Allow: / -Disallow: /wp-json/ -Disallow: /?rest_route= -Disallow: /wp-admin/ -Disallow: /wp-content/cache/ -Disallow: /wp-content/plugins/ -Disallow: /xmlrpc.php -Disallow: /wp-includes/ -Allow: /wp-includes/css/ -Allow: /wp-includes/js/ - -# XML Sitemap: -Sitemap: https://example.com/sitemap_index.xml -``` +The default output of this plugin [can be seen here on joost.blog](https://joost.blog/robots.txt) or [here on emilia.capital](https://emilia.capital/robots.txt). ## Filters diff --git a/src/class-plugin.php b/src/class-plugin.php index 31c47cb..82fc146 100644 --- a/src/class-plugin.php +++ b/src/class-plugin.php @@ -9,6 +9,27 @@ class Plugin { const BACKUP_PATH = ABSPATH . 'robots.txt.eco-friendly-backup'; + /** + * The list of allowed crawlers. + * + * @var string[] + */ + public $allowed_spiders; + + /** + * The list of blocked paths. + * + * @var string[] + */ + public $blocked_paths; + + /** + * The list of allowed paths. + * + * @var string[] + */ + public $allowed_paths; + /** * Initialize the hooks and filters. */ @@ -60,6 +81,11 @@ public function modify_robots_txt( $output, $site_public ) { return "User-agent: *\nDisallow: /\n"; } + // We only need to do this when we're actually sending a robots.txt, hence here. 
+ $this->allowed_spiders = $this->get_allowed_spiders(); + $this->blocked_paths = $this->get_blocked_paths(); + $this->allowed_paths = $this->get_allowed_paths(); + $robots_txt = "# This site is very specific about who it allows crawling from.\n"; $robots_txt .= "# Our default is to not allow crawling:\n"; $robots_txt .= "User-agent: *\n"; @@ -68,17 +94,16 @@ public function modify_robots_txt( $output, $site_public ) { $robots_txt .= "\n# Below are the crawlers that are allowed to crawl this site.\n"; $robots_txt .= "# Below that list, you'll find paths that are blocked, even for them,\n"; $robots_txt .= "# and then paths within those blocked paths that are allowed.\n"; - foreach ( $this->get_allowed_spiders() as $crawler ) { + foreach ( $this->allowed_spiders as $crawler ) { $robots_txt .= "User-agent: $crawler\n"; - } - $robots_txt .= "Allow: /\n"; - - foreach ( $this->get_blocked_paths() as $path ) { - $robots_txt .= "Disallow: $path\n"; - } + $robots_txt .= "Allow: /\n"; + foreach ( $this->blocked_paths as $path ) { + $robots_txt .= "Disallow: $path\n"; + } - foreach ( $this->get_allowed_paths() as $path ) { - $robots_txt .= "Allow: $path\n"; + foreach ( $this->allowed_paths as $path ) { + $robots_txt .= "Allow: $path\n"; + } } // Keep existing Sitemap references.
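
For reference, the reworked loop emits the full Allow/Disallow block once per allowed crawler instead of listing every `User-agent` line above a single shared record. Below is a rough, illustrative sketch of the resulting output — it assumes the default crawler and path lists from the old README example, trimmed to two crawlers and a shortened path list, so the real file on a given site will differ:

```txt
# This site is very specific about who it allows crawling from.
# Our default is to not allow crawling:
User-agent: *
Disallow: /

# Below are the crawlers that are allowed to crawl this site.
# Below that list, you'll find paths that are blocked, even for them,
# and then paths within those blocked paths that are allowed.
User-agent: Googlebot
Allow: /
Disallow: /wp-json/
Disallow: /wp-admin/
Disallow: /wp-includes/
Allow: /wp-includes/css/
Allow: /wp-includes/js/
User-agent: Bingbot
Allow: /
Disallow: /wp-json/
Disallow: /wp-admin/
Disallow: /wp-includes/
Allow: /wp-includes/css/
Allow: /wp-includes/js/

# XML Sitemap:
Sitemap: https://example.com/sitemap_index.xml
```

Repeating the directives per crawler makes the file longer, but as the commit subject notes, it avoids relying on grouped `User-agent` records, which not all user agents support.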