From d52aee041fdeb2caa8cae2b7075b594b1b11be1a Mon Sep 17 00:00:00 2001
From: Lukas Niebler <58593037+lukasniebler@users.noreply.github.com>
Date: Fri, 10 Jan 2025 14:05:03 +0100
Subject: [PATCH] feat(data.php) - Add more safeguards
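Adds HTTP status, file size, row count and time limit checks when fetching and parsing the statistics, so that overly large datasets are not processed.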
---
includes/Analytics.php | 11 +-
includes/Dashboard.php | 2 +-
includes/Data.php | 259 +++++++++++++++++++++++++++--------------
includes/Language.php | 2 +-
rrze-statistik.php | 2 +-
5 files changed, 182 insertions(+), 94 deletions(-)
diff --git a/includes/Analytics.php b/includes/Analytics.php
index f110e4a..b7373d9 100644
--- a/includes/Analytics.php
+++ b/includes/Analytics.php
@@ -40,7 +40,7 @@ public static function retrieveSiteUrl($type)
} else if ($type === 'logs') {
$output = 'https://statistiken.rrze.fau.de/webauftritte/logs/' . $url;
} else {
- $output = 'https://statistiken.rrze.fau.de/webauftritte/logs/' . $url . '/url_' . Self::getDate() . '.tab';
+ $output = 'https://statistiken.rrze.fau.de/webauftritte/logs/' . $url . '/url_' . self::getDate() . '.tab';
}
return $output;
}
@@ -55,7 +55,7 @@ public function getLinechart($container)
{
$remove_char = ["https://", "http://", "/"];
$site = str_replace($remove_char, "", get_site_url());
- $ready_check = Data::processLinechartDataset(Self::retrieveSiteUrl('webalizer.hist'));
+ $ready_check = Data::processLinechartDataset(self::retrieveSiteUrl('webalizer.hist'));
if ($ready_check === false) {
return printf(__('It might take a few days until personal statistics for your website ( %1$s ) are displayed within your dashboard.', 'rrze-statistik'), $site) . '
';
} else {
@@ -102,27 +102,28 @@ public static function getUrlDatasetTable()
{
$output = ''; // Initialize $output
$data = get_transient('rrze_statistik_data_url');
+
if (!$data) {
return __('It might take a few weeks until the summary is displayed on your dashboard.', 'rrze-statistik') . '
';
} else {
if (array_key_exists(0, $data)) {
$top_url = $data[0];
if (!empty($top_url)) {
- $table1 = Self::getTwoDimensionalHtmlTable($top_url, 0, 1, __('Hits', 'rrze-statistik'), __('Sites', 'rrze-statistik'));
+ $table1 = self::getTwoDimensionalHtmlTable($top_url, 0, 1, __('Hits', 'rrze-statistik'), __('Sites', 'rrze-statistik'));
$output = $table1;
}
}
if (array_key_exists(1, $data)) {
$top_images = $data[1];
if (!empty($top_images)) {
- $table2 = Self::getTwoDimensionalHtmlTable($top_images, 0, 1, __('Hits', 'rrze-statistik'), __('Media', 'rrze-statistik'));
+ $table2 = self::getTwoDimensionalHtmlTable($top_images, 0, 1, __('Hits', 'rrze-statistik'), __('Media', 'rrze-statistik'));
$output .= $table2;
}
}
if (array_key_exists(2, $data)) {
$top_pdf = $data[2];
if (!empty($top_pdf)) {
- $table3 = Self::getTwoDimensionalHtmlTable($top_pdf, 0, 1, __('Hits', 'rrze-statistik'), __('Documents', 'rrze-statistik'));
+ $table3 = self::getTwoDimensionalHtmlTable($top_pdf, 0, 1, __('Hits', 'rrze-statistik'), __('Documents', 'rrze-statistik'));
$output .= $table3;
}
}
diff --git a/includes/Dashboard.php b/includes/Dashboard.php
index 453d959..12ee19e 100644
--- a/includes/Dashboard.php
+++ b/includes/Dashboard.php
@@ -15,7 +15,7 @@ public function __construct()
add_action('wp_ajax_widgetsave', [$this, 'rrze_statistik_save_widget']);
add_action('wp_ajax_showform', [$this, 'rrze_statistik_ajax_show_form']);
- Self::prefill_options();
+ self::prefill_options();
}
/**
diff --git a/includes/Data.php b/includes/Data.php
index 8e4efa0..5806562 100644
--- a/includes/Data.php
+++ b/includes/Data.php
@@ -21,10 +21,10 @@ public static function setTransients()
$screen = get_current_screen();
if ($screen->id == "dashboard") {
if (!get_transient('rrze_statistik_data_webalizer_hist')) {
- Self::updateData();
+ self::updateData();
}
if (!get_transient('rrze_statistik_data_url')) {
- Self::updateUrlData();
+ self::updateUrlData();
}
}
}
@@ -40,12 +40,12 @@ public static function updateData()
{
// Fetch Dataset Webalizer.hist
$url = Analytics::retrieveSiteUrl('webalizer.hist');
- $data_body = Self::fetchDataBody($url);
- $validation = Self::validateData($data_body);
+ $data_body = self::fetchDataBody($url);
+ $validation = self::validateData($data_body);
if ($validation === false) {
return false;
} else {
- $data = Self::processDataBody($data_body);
+ $data = self::processDataBody($data_body);
array_pop($data);
set_transient('rrze_statistik_data_webalizer_hist', $data, 6 * HOUR_IN_SECONDS);
return true;
@@ -61,31 +61,69 @@ public static function updateUrlData()
{
// Fetch Dataset
$url = Analytics::retrieveSiteUrl('url');
- $data_body = Self::fetchDataBody($url);
- $validation = Self::validateData($data_body);
+ $data_body = self::fetchDataBody($url);
+ $validation = self::validateData($data_body);
if ($validation === false) {
return false;
} else {
$data = substr($data_body, 0, 9999);
- $processed_data = Self::processUrlDataBody($data);
+ $processed_data = self::processUrlDataBody($data);
set_transient('rrze_statistik_data_url', $processed_data, 12 * HOUR_IN_SECONDS);
return true;
}
}
/**
- * Fetches body $url from statistiken.rrze.fau.de
+ * Fetches the body of $url from statistiken.rrze.fau.de.
+ *
+ * A HEAD request is sent first so that HTTP errors (status >= 400) and files
+ * announcing more than 12 MB via Content-Length are rejected before anything
+ * is downloaded. Every failure is reported through Helper::debug().
*
* @param string $url
- * @return string
+ * @return string|false Response body, or false if the request failed, was rejected or returned an empty body
*/
public static function fetchDataBody($url)
{
- $cachable = wp_remote_get(esc_url_raw($url));
- $cachable_body = wp_remote_retrieve_body($cachable);
- return $cachable_body;
+ $response = wp_remote_head(esc_url_raw($url)); // HEAD request first: inspect status and size before downloading
+ if (is_wp_error($response)) {
+ Helper::debug('RRZE Statistik | Failed to fetch headers: ' . $response->get_error_message());
+ return false;
+ }
+
+ // Check the HTTP Status Code
+ $status_code = wp_remote_retrieve_response_code($response);
+ if ($status_code == 403) {
+ Helper::debug('RRZE Statistik | HTTP 403 Forbidden: Access denied.');
+ return false;
+ } elseif ($status_code == 404) {
+ Helper::debug('RRZE Statistik | HTTP 404 Not Found: Resource unavailable.');
+ return false;
+ } elseif ($status_code >= 400) {
+ Helper::debug('RRZE Statistik | HTTP Error ' . $status_code . ': Aborting fetch.');
+ return false;
+ }
+
+ // Check the Data Size
+ $content_length = wp_remote_retrieve_header($response, 'content-length');
+ Helper::debug('RRZE Statistik | HEAD request returned HTTP ' . $status_code . '.');
+ // is_numeric() skips a missing header ('') as well as the array returned for a repeated header
+ if (is_numeric($content_length) && (int) $content_length > 12 * 1024 * 1024) { // Set Size Limit
+ Helper::debug('RRZE Statistik | File size exceeds the 12 MB limit. Aborting fetch.');
+ return false;
+ }
+
+ // Only Fetch Content, if Size Limit of 12 MB is not exceeded
+ $response = wp_remote_get(esc_url_raw($url), ['timeout' => 10]); // Set Timeout
+ if (is_wp_error($response)) {
+ Helper::debug('RRZE Statistik | Failed to fetch data: ' . $response->get_error_message());
+ return false;
+ }
+
+ $body = wp_remote_retrieve_body($response);
+ if (empty($body)) {
+ // Single-argument call, matching every other Helper::debug() call in this method
+ Helper::debug('RRZE Statistik | Empty response body.');
+ return false;
+ }
+
+ return $body;
}
/**
@@ -119,6 +157,13 @@ public static function validateData($data_body)
}
}
+ /**
+ * Yields the rows of the given data body one at a time.
+ *
+ * Rows are split on \r\n, \n and \r, the same separators the removed
+ * preg_split() call in processUrlDataBody() used, so that rows of a
+ * CRLF-terminated file do not keep a trailing carriage return.
+ *
+ * @param string $data_trim Data body; the caller passes it already right-trimmed
+ * @return \Generator Yields one row (string) per iteration
+ */
+ private static function generateRows($data_trim)
+{
+ foreach (preg_split("/\r\n|\n|\r/", $data_trim) as $row) {
+ yield $row;
+ }
+}
+
/**
* Converts .tab separated data in associative array for later use. in Cronjob, weekly
*
@@ -127,84 +172,126 @@ public static function validateData($data_body)
*/
public static function processUrlDataBody($data_body)
{
+ // Trim the input; the rows themselves are split by generateRows() in the loop below
$data_trim = rtrim($data_body, " \n\r\t\v");
- $array = preg_split("/\r\n|\n|\r/", $data_trim);
- $image_files = [];
+ if (empty($data_trim)) {
+ return [[], [], []]; // Return empty arrays if no data
+ }
+
+ // Plain substrings (matched with strpos) used to classify each URL
+ $ignore_patterns = [
+ "wp-includes",
+ "wp-content",
+ "wp-json",
+ "wp-admin",
+ "robots",
+ "xml",
+ "module.php",
+ ".css",
+ ".js",
+ ".json",
+ "/feed"
+ ];
+
+ $image_patterns = [
+ ".jpg",
+ ".jpeg",
+ ".gif",
+ ".png",
+ ".svg",
+ ".webp",
+ ".ico",
+ ".bmp",
+ ".tiff",
+ ".tif",
+ ".psd",
+ ".ai",
+ ".eps"
+ ];
+
+ $document_patterns = [
+ ".pdf",
+ ".docx",
+ ".ppt",
+ ".pptx",
+ ".xls",
+ ".xlsx",
+ ".doc",
+ ".zip",
+ ".rar"
+ ];
+
+ // Initialize result arrays
$sites = [];
+ $image_files = [];
$pdf_files = [];
- $output = [];
- foreach ($array as $value) {
- $array_splitted = preg_split("/ |( )/", $value);
- \array_splice($array_splitted, 1, -1);
- //Following file extensions are ignored
- if(isset($array_splitted[1])){
- if (
- strpos($array_splitted[1], "wp-includes") !== false
- || strpos($array_splitted[1], "wp-content") !== false
- || strpos($array_splitted[1], "wp-json") !== false
- || strpos($array_splitted[1], "wp-admin") !== false
- || strpos($array_splitted[1], "robots") !== false
- || strpos($array_splitted[1], "xml") !== false
- || strpos($array_splitted[1], "module.php") !== false
- || strpos($array_splitted[1], ".css") !== false
- || strpos($array_splitted[1], ".js") !== false
- || strpos($array_splitted[1], ".json") !== false
- || strpos($array_splitted[1], "/feed") !== false
- ) {
- //Following file extensions are listed below sites in Dashboard
- } elseif (
- strpos($array_splitted[1], ".jpg") !== false
- || strpos($array_splitted[1], ".jpeg") !== false
- || strpos($array_splitted[1], ".gif") !== false
- || strpos($array_splitted[1], ".png") !== false
- || strpos($array_splitted[1], ".svg") !== false
- || strpos($array_splitted[1], ".webp") !== false
- || strpos($array_splitted[1], ".ico") !== false
- || strpos($array_splitted[1], ".bmp") !== false
- || strpos($array_splitted[1], ".tiff") !== false
- || strpos($array_splitted[1], ".tif") !== false
- || strpos($array_splitted[1], ".psd") !== false
- || strpos($array_splitted[1], ".ai") !== false
- || strpos($array_splitted[1], ".eps") !== false
- ) {
- array_push($image_files, $array_splitted);
- //Following file extensions are listed below documents in Dashboard
- } elseif (
- strpos($array_splitted[1], ".pdf") !== false
- || strpos($array_splitted[1], ".docx") !== false
- || strpos($array_splitted[1], ".ppt") !== false
- || strpos($array_splitted[1], ".pptx") !== false
- || strpos($array_splitted[1], ".xls") !== false
- || strpos($array_splitted[1], ".xlsx") !== false
- || strpos($array_splitted[1], ".doc") !== false
- || strpos($array_splitted[1], ".zip") !== false
- || strpos($array_splitted[1], ".rar") !== false
-
- ) {
- array_push($pdf_files, $array_splitted);
+ // Set iteration and time limits
+ $max_iterations = 10000; // Maximum rows to process
+ $start_time = microtime(true);
+ $time_limit = 5; // Max processing time in seconds
+
+ $index = 0;
+ foreach (self::generateRows($data_trim) as $row) {
+ $index++;
+ // Stop once more than $max_iterations rows were seen or the time budget is used up
+ if ($index > $max_iterations || (microtime(true) - $start_time) > $time_limit) {
+ break;
+ }
+
+ $columns = preg_split("/\t+/", $row, -1, PREG_SPLIT_NO_EMPTY);
+ // Drop columns 2-5: keeps the first column and everything from the sixth column on
+ array_splice($columns, 1, 4);
+ if (count($columns) < 2) {
+ continue; // row has fewer than six columns, nothing usable
+ }
+
+ $url = $columns[1];
+
+ // Check if the URL matches any ignore patterns
+ if (self::matchesAnyPattern($url, $ignore_patterns)) {
+ continue;
+ }
+
+ // Categorize the URL into images, documents, or other sites
+ if (self::matchesAnyPattern($url, $image_patterns)) {
+ $image_files[] = $columns;
+ } elseif (self::matchesAnyPattern($url, $document_patterns)) {
+ $pdf_files[] = $columns;
} else {
- array_push($sites, $array_splitted);
+ $sites[] = $columns;
}
- }
- }
- //if last array item has no trailing slash, remove it
- if (!empty($sites) && substr($sites[count($sites) - 1][1], -1) !== "/") {
- array_pop($sites);
- }
- array_pop($image_files);
- array_pop($pdf_files);
+ // Stop processing if enough items are collected
+ if (count($sites) >= 10 && count($image_files) >= 10 && count($pdf_files) >= 5) {
+ break;
+ }
+ }
- //check if value isNull
- is_null($sites) ? $sites = [] : $sites;
- is_null($image_files) ? $image_files = [] : $image_files;
- is_null($pdf_files) ? $pdf_files = [] : $pdf_files;
+ // NOTE(review): the removed code dropped the last collected entries, presumably because
+ // the caller passes a body cut off by substr(); a truncated final row is no longer
+ // discarded here - confirm this is intended.
+ return [
+ array_slice($sites, 0, 10),
+ array_slice($image_files, 0, 10),
+ array_slice($pdf_files, 0, 5),
+ ];
+ }
- $output = [array_slice($sites, 0, 10), array_slice($image_files, 0, 10), array_slice($pdf_files, 0, 5)];
- return ($output);
+ /**
+ * Checks whether $text contains at least one of the given substrings.
+ *
+ * The patterns are plain substrings compared with strpos(), not regular
+ * expressions. Returns a real boolean, as documented, and stops at the
+ * first hit instead of testing every pattern.
+ *
+ * @param string $text
+ * @param array $patterns
+ * @return bool
+ */
+ private static function matchesAnyPattern($text, $patterns)
+ {
+ foreach ($patterns as $pattern) {
+ if (strpos($text, $pattern) !== false) {
+ return true;
+ }
+ }
+ return false;
}
+
/**
* Transforms webalizer.hist into Array and keys it with associated keymap.
*
@@ -225,26 +312,26 @@ public static function processDataBody($data_body)
'pages',
'visits',
);
- $data_trim = rtrim($data_body, " \n\r\t\v");
+ $data_trim = trim($data_body, " \n\r\t\v");
$array = preg_split("/\r\n|\n|\r/", $data_trim);
$output = [];
// Helper::debug($data_body);
-
+
foreach ($array as $value) {
$splittedValue = preg_split("/ /", $value);
-
+
if (count($keymap) !== count($splittedValue)) {
Helper::debug('RRZE Statistik | Statistiken.rrze.fau.de fetch response body: The server is temporarily unable to service your request due to maintenance downtime or capacity problems. Please try again later.');
continue;
}
-
+
array_push($output, array_combine($keymap, $splittedValue));
}
-
+
return $output;
}
-
+
/**
* Uses a set of functions to fetch webalizer.hist, process the data, set description
diff --git a/includes/Language.php b/includes/Language.php
index 0fc4721..36f55cd 100644
--- a/includes/Language.php
+++ b/includes/Language.php
@@ -29,7 +29,7 @@ public static function getMonthDesc()
public static function getAbscissa()
{
- $abscissa_desc = Self::getMonthDesc();
+ $abscissa_desc = self::getMonthDesc(); // the abscissa values are exactly what getMonthDesc() returns
return $abscissa_desc;
}
diff --git a/rrze-statistik.php b/rrze-statistik.php
index 96285d8..f513b79 100644
--- a/rrze-statistik.php
+++ b/rrze-statistik.php
@@ -4,7 +4,7 @@
Plugin Name: RRZE Statistics
Plugin URI: https://github.com/rrze-webteam/rrze-statistik
Description: Displays monthly statistics from https://statistiken.rrze.fau.de within your Dashboard.
-Version: 1.1.14
+Version: 1.1.15
Author: RRZE Webteam
Author URI: https://blogs.fau.de/webworking/
License: GNU General Public License v2