From d52aee041fdeb2caa8cae2b7075b594b1b11be1a Mon Sep 17 00:00:00 2001 From: Lukas Niebler <58593037+lukasniebler@users.noreply.github.com> Date: Fri, 10 Jan 2025 14:05:03 +0100 Subject: [PATCH] feat(data.php) - Add more safeguards Adds more checks to avoid too large datasets. --- includes/Analytics.php | 11 +- includes/Dashboard.php | 2 +- includes/Data.php | 259 +++++++++++++++++++++++++++-------------- includes/Language.php | 2 +- rrze-statistik.php | 2 +- 5 files changed, 182 insertions(+), 94 deletions(-) diff --git a/includes/Analytics.php b/includes/Analytics.php index f110e4a..b7373d9 100644 --- a/includes/Analytics.php +++ b/includes/Analytics.php @@ -40,7 +40,7 @@ public static function retrieveSiteUrl($type) } else if ($type === 'logs') { $output = 'https://statistiken.rrze.fau.de/webauftritte/logs/' . $url; } else { - $output = 'https://statistiken.rrze.fau.de/webauftritte/logs/' . $url . '/url_' . Self::getDate() . '.tab'; + $output = 'https://statistiken.rrze.fau.de/webauftritte/logs/' . $url . '/url_' . self::getDate() . '.tab'; } return $output; } @@ -55,7 +55,7 @@ public function getLinechart($container) { $remove_char = ["https://", "http://", "/"]; $site = str_replace($remove_char, "", get_site_url()); - $ready_check = Data::processLinechartDataset(Self::retrieveSiteUrl('webalizer.hist')); + $ready_check = Data::processLinechartDataset(self::retrieveSiteUrl('webalizer.hist')); if ($ready_check === false) { return printf(__('It might take a few days until personal statistics for your website ( %1$s ) are displayed within your dashboard.', 'rrze-statistik'), $site) . '
'; } else { @@ -102,27 +102,28 @@ public static function getUrlDatasetTable() { $output = ''; // Initialize $output $data = get_transient('rrze_statistik_data_url'); + if (!$data) { return __('It might take a few weeks until the summary is displayed on your dashboard.', 'rrze-statistik') . '
'; } else { if (array_key_exists(0, $data)) { $top_url = $data[0]; if (!empty($top_url)) { - $table1 = Self::getTwoDimensionalHtmlTable($top_url, 0, 1, __('Hits', 'rrze-statistik'), __('Sites', 'rrze-statistik')); + $table1 = self::getTwoDimensionalHtmlTable($top_url, 0, 1, __('Hits', 'rrze-statistik'), __('Sites', 'rrze-statistik')); $output = $table1; } } if (array_key_exists(1, $data)) { $top_images = $data[1]; if (!empty($top_images)) { - $table2 = Self::getTwoDimensionalHtmlTable($top_images, 0, 1, __('Hits', 'rrze-statistik'), __('Media', 'rrze-statistik')); + $table2 = self::getTwoDimensionalHtmlTable($top_images, 0, 1, __('Hits', 'rrze-statistik'), __('Media', 'rrze-statistik')); $output .= $table2; } } if (array_key_exists(2, $data)) { $top_pdf = $data[2]; if (!empty($top_pdf)) { - $table3 = Self::getTwoDimensionalHtmlTable($top_pdf, 0, 1, __('Hits', 'rrze-statistik'), __('Documents', 'rrze-statistik')); + $table3 = self::getTwoDimensionalHtmlTable($top_pdf, 0, 1, __('Hits', 'rrze-statistik'), __('Documents', 'rrze-statistik')); $output .= $table3; } } diff --git a/includes/Dashboard.php b/includes/Dashboard.php index 453d959..12ee19e 100644 --- a/includes/Dashboard.php +++ b/includes/Dashboard.php @@ -15,7 +15,7 @@ public function __construct() add_action('wp_ajax_widgetsave', [$this, 'rrze_statistik_save_widget']); add_action('wp_ajax_showform', [$this, 'rrze_statistik_ajax_show_form']); - Self::prefill_options(); + self::prefill_options(); } /** diff --git a/includes/Data.php b/includes/Data.php index 8e4efa0..5806562 100644 --- a/includes/Data.php +++ b/includes/Data.php @@ -21,10 +21,10 @@ public static function setTransients() $screen = get_current_screen(); if ($screen->id == "dashboard") { if (!get_transient('rrze_statistik_data_webalizer_hist')) { - Self::updateData(); + self::updateData(); } if (!get_transient('rrze_statistik_data_url')) { - Self::updateUrlData(); + self::updateUrlData(); } } } @@ -40,12 +40,12 @@ public static function updateData() { // Fetch Dataset Webalizer.hist $url = Analytics::retrieveSiteUrl('webalizer.hist'); - $data_body = Self::fetchDataBody($url); - $validation = Self::validateData($data_body); + $data_body = self::fetchDataBody($url); + $validation = self::validateData($data_body); if ($validation === false) { return false; } else { - $data = Self::processDataBody($data_body); + $data = self::processDataBody($data_body); array_pop($data); set_transient('rrze_statistik_data_webalizer_hist', $data, 6 * HOUR_IN_SECONDS); return true; @@ -61,31 +61,69 @@ public static function updateUrlData() { // Fetch Dataset $url = Analytics::retrieveSiteUrl('url'); - $data_body = Self::fetchDataBody($url); - $validation = Self::validateData($data_body); + $data_body = self::fetchDataBody($url); + $validation = self::validateData($data_body); if ($validation === false) { return false; } else { $data = substr($data_body, 0, 9999); - $processed_data = Self::processUrlDataBody($data); + $processed_data = self::processUrlDataBody($data); set_transient('rrze_statistik_data_url', $processed_data, 12 * HOUR_IN_SECONDS); return true; } } /** - * Fetches body $url from statistiken.rrze.fau.de + * Fetches body $url from statistiken.rrze.fau.de and aborts if the file is too large * * @param string $url - * @return string + * @return string|false */ public static function fetchDataBody($url) { - $cachable = wp_remote_get(esc_url_raw($url)); - $cachable_body = wp_remote_retrieve_body($cachable); - return $cachable_body; + $response = wp_remote_head($url); // Initial HEAD-Request, um die Header zu prüfen + if (is_wp_error($response)) { + Helper::debug('RRZE Statistik | Failed to fetch headers: ' . $response->get_error_message()); + return false; + } + + // Check the HTTP Status Code + $status_code = wp_remote_retrieve_response_code($response); + if ($status_code == 403) { + Helper::debug('RRZE Statistik | HTTP 403 Forbidden: Access denied.'); + return false; + } elseif ($status_code == 404) { + Helper::debug('RRZE Statistik | HTTP 404 Not Found: Resource unavailable.'); + return false; + } elseif ($status_code >= 400) { + Helper::debug('RRZE Statistik | HTTP Error ' . $status_code . ': Aborting fetch.'); + return false; + } + + // Check the Data Size + $content_length = wp_remote_retrieve_header($response, 'content-length'); + Helper::debug(wp_remote_retrieve_response_code($response)); + if ($content_length && $content_length > 12 * 1024 * 1024) { // Set Size Limit + Helper::debug('RRZE Statistik | File size exceeds the 12 MB limit. Aborting fetch.'); + return false; + } + + // Only Fetch Content, if Size Limit of 12 MB is not exceeded + $response = wp_remote_get(esc_url_raw($url), ['timeout' => 10]); // Set Timeout + if (is_wp_error($response)) { + Helper::debug('RRZE Statistik | Failed to fetch data: ' . $response->get_error_message()); + return false; + } + + $body = wp_remote_retrieve_body($response); + if (empty($body)) { + Helper::debug('rrze.log.error', 'RRZE Statistik | Empty response body.'); + return false; + } + + return $body; } /** @@ -119,6 +157,13 @@ public static function validateData($data_body) } } + private static function generateRows($data_trim) +{ + foreach (explode("\n", $data_trim) as $row) { + yield $row; + } +} + /** * Converts .tab separated data in associative array for later use. in Cronjob, weekly * @@ -127,84 +172,126 @@ public static function validateData($data_body) */ public static function processUrlDataBody($data_body) { + // Trim and split the input data $data_trim = rtrim($data_body, " \n\r\t\v"); - $array = preg_split("/\r\n|\n|\r/", $data_trim); - $image_files = []; + if (empty($data_trim)) { + return [[], [], []]; // Return empty arrays if no data + } + $rows = explode("\n", $data_trim); + + // Precompile patterns for better performance + $ignore_patterns = [ + "wp-includes", + "wp-content", + "wp-json", + "wp-admin", + "robots", + "xml", + "module.php", + ".css", + ".js", + ".json", + "/feed" + ]; + + $image_patterns = [ + ".jpg", + ".jpeg", + ".gif", + ".png", + ".svg", + ".webp", + ".ico", + ".bmp", + ".tiff", + ".tif", + ".psd", + ".ai", + ".eps" + ]; + + $document_patterns = [ + ".pdf", + ".docx", + ".ppt", + ".pptx", + ".xls", + ".xlsx", + ".doc", + ".zip", + ".rar" + ]; + + // Initialize result arrays $sites = []; + $image_files = []; $pdf_files = []; - $output = []; - foreach ($array as $value) { - $array_splitted = preg_split("/ |( )/", $value); - \array_splice($array_splitted, 1, -1); - //Following file extensions are ignored - if(isset($array_splitted[1])){ - if ( - strpos($array_splitted[1], "wp-includes") !== false - || strpos($array_splitted[1], "wp-content") !== false - || strpos($array_splitted[1], "wp-json") !== false - || strpos($array_splitted[1], "wp-admin") !== false - || strpos($array_splitted[1], "robots") !== false - || strpos($array_splitted[1], "xml") !== false - || strpos($array_splitted[1], "module.php") !== false - || strpos($array_splitted[1], ".css") !== false - || strpos($array_splitted[1], ".js") !== false - || strpos($array_splitted[1], ".json") !== false - || strpos($array_splitted[1], "/feed") !== false - ) { - //Following file extensions are listed below sites in Dashboard - } elseif ( - strpos($array_splitted[1], ".jpg") !== false - || strpos($array_splitted[1], ".jpeg") !== false - || strpos($array_splitted[1], ".gif") !== false - || strpos($array_splitted[1], ".png") !== false - || strpos($array_splitted[1], ".svg") !== false - || strpos($array_splitted[1], ".webp") !== false - || strpos($array_splitted[1], ".ico") !== false - || strpos($array_splitted[1], ".bmp") !== false - || strpos($array_splitted[1], ".tiff") !== false - || strpos($array_splitted[1], ".tif") !== false - || strpos($array_splitted[1], ".psd") !== false - || strpos($array_splitted[1], ".ai") !== false - || strpos($array_splitted[1], ".eps") !== false - ) { - array_push($image_files, $array_splitted); - //Following file extensions are listed below documents in Dashboard - } elseif ( - strpos($array_splitted[1], ".pdf") !== false - || strpos($array_splitted[1], ".docx") !== false - || strpos($array_splitted[1], ".ppt") !== false - || strpos($array_splitted[1], ".pptx") !== false - || strpos($array_splitted[1], ".xls") !== false - || strpos($array_splitted[1], ".xlsx") !== false - || strpos($array_splitted[1], ".doc") !== false - || strpos($array_splitted[1], ".zip") !== false - || strpos($array_splitted[1], ".rar") !== false - - ) { - array_push($pdf_files, $array_splitted); + // Set iteration and time limits + $max_iterations = 10000; // Maximum rows to process + $start_time = microtime(true); + $time_limit = 5; // Max processing time in seconds + + $index = 0; + foreach (self::generateRows($data_trim) as $row) { + $index++; + // Stop processing if iteration or time limits are exceeded + if ($index >= $max_iterations || (microtime(true) - $start_time) > $time_limit) { + break; + } + + $columns = preg_split("/\t+/", $row, -1, PREG_SPLIT_NO_EMPTY); + unset($columns[4]); + unset($columns[3]); + unset($columns[2]); + unset($columns[1]); + $columns = array_values($columns); + if (count($columns) < 2) { + continue; + } + + $url = $columns[1]; + + // Check if the URL matches any ignore patterns + if (self::matchesAnyPattern($url, $ignore_patterns)) { + continue; + } + + // Categorize the URL into images, documents, or other sites + if (self::matchesAnyPattern($url, $image_patterns)) { + $image_files[] = $columns; + } elseif (self::matchesAnyPattern($url, $document_patterns)) { + $pdf_files[] = $columns; } else { - array_push($sites, $array_splitted); + $sites[] = $columns; } - } - } - //if last array item has no trailing slash, remove it - if (!empty($sites) && substr($sites[count($sites) - 1][1], -1) !== "/") { - array_pop($sites); - } - array_pop($image_files); - array_pop($pdf_files); + // Stop processing if enough items are collected + if (count($sites) >= 10 && count($image_files) >= 10 && count($pdf_files) >= 5) { + break; + } + } - //check if value isNull - is_null($sites) ? $sites = [] : $sites; - is_null($image_files) ? $image_files = [] : $image_files; - is_null($pdf_files) ? $pdf_files = [] : $pdf_files; + return [ + array_slice($sites, 0, 10), + array_slice($image_files, 0, 10), + array_slice($pdf_files, 0, 5), + ]; + } - $output = [array_slice($sites, 0, 10), array_slice($image_files, 0, 10), array_slice($pdf_files, 0, 5)]; - return ($output); + /** + * Checks if a string matches any pattern in the given array + * + * @param string $text + * @param array $patterns + * @return bool + */ + private static function matchesAnyPattern($text, $patterns) + { + return array_filter($patterns, fn($pattern) => strpos($text, $pattern) !== false); } + /** * Transforms webalizer.hist into Array and keys it with associated keymap. * @@ -225,26 +312,26 @@ public static function processDataBody($data_body) 'pages', 'visits', ); - $data_trim = rtrim($data_body, " \n\r\t\v"); + $data_trim = trim($data_body, " \n\r\t\v"); $array = preg_split("/\r\n|\n|\r/", $data_trim); $output = []; // Helper::debug($data_body); - + foreach ($array as $value) { $splittedValue = preg_split("/ /", $value); - + if (count($keymap) !== count($splittedValue)) { Helper::debug('RRZE Statistik | Statistiken.rrze.fau.de fetch response body: The server is temporarily unable to service your request due to maintenance downtime or capacity problems. Please try again later.'); continue; } - + array_push($output, array_combine($keymap, $splittedValue)); } - + return $output; } - + /** * Uses a set of functions to fetch webalizer.hist, process the data, set description diff --git a/includes/Language.php b/includes/Language.php index 0fc4721..36f55cd 100644 --- a/includes/Language.php +++ b/includes/Language.php @@ -29,7 +29,7 @@ public static function getMonthDesc() public static function getAbscissa() { - $abscissa_desc = Self::getMonthDesc(); + $abscissa_desc = self::getMonthDesc(); return $abscissa_desc; } diff --git a/rrze-statistik.php b/rrze-statistik.php index 96285d8..f513b79 100644 --- a/rrze-statistik.php +++ b/rrze-statistik.php @@ -4,7 +4,7 @@ Plugin Name: RRZE Statistics Plugin URI: https://github.com/rrze-webteam/rrze-statistik Description: Displays monthly statistics from https://statistiken.rrze.fau.de within your Dashboard. -Version: 1.1.14 +Version: 1.1.15 Author: RRZE Webteam Author URI: https://blogs.fau.de/webworking/ License: GNU General Public License v2