Skip to content

Latest commit

 

History

History
3750 lines (3336 loc) · 107 KB

README.md

File metadata and controls

3750 lines (3336 loc) · 107 KB

概述

​ 通过机器学习方法对北京地区的空气质量进行分类预测,以期降低监测成本并提升预测精度。构建了两个模型:一是基于随机森林算法的分类预测模型,二是基于 长短期记忆网络(LSTM)的时间序列预测模型。在分类模型的构建过程中,对比了随机森林 算法和支持向量机(SVM)算法,并针对这两种算法进行了细致的参数调优,最终,随机森林 模型在测试集上达到了0.817的准确率和0.815的F1分数,而SVM模型在测试集上准确率和 F1分数均为0.758,随机森林算法因其在测试集上展现出的较高准确率和泛化能力而被选为最终模型。分类模型的创新之处在于减少了对高成本空气污染物数据的依赖,通过更多地利用易于获取的气象数据中的有效特征值进行对空气质量等级的预测,有效地降低了测量成 本。而LSTM模型的构建考虑了时间序列数据的特点,利用历史数据对未来各项空气污染物指 数的数值进行了细致的预测,最终模型的预测精度达到了0.977,充分表现了其在实际应用中 的重要价值。未来可以进一步探索模型优化空间,以期达到更高的预测精度和更好的实际应用效果。

Classification-Algorithm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt   ##绘图库

plt.rcParams['font.sans-serif'] = ['SimHei']  # 指定默认字体为新宋体。
plt.rcParams['axes.unicode_minus'] = False  # 解决保存图像时'-'显示为方块的问题。

from sklearn.model_selection import train_test_split   ## 划分

import torch

from sklearn.model_selection import train_test_split   #训练集测试集划分
from sklearn.ensemble import RandomForestClassifier    #随机森林相关库
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score        #模型精度评分
from sklearn.metrics import confusion_matrix           #混淆矩阵表
from sklearn import svm  #支持向量机

# 检查是否有可用的 GPU
if torch.cuda.is_available():
    device = torch.device("cuda")  # 使用GPU
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")  # 使用 CPU
    print("Using CPU")
Using GPU: NVIDIA GeForce MX450
# 加载 csv文件
merged_data = pd.read_csv('merged_data.csv')

data = merged_data.drop(['AQI','PM2.5','PM10','NO2','CO','SO2','O3_8h'],axis=1)
# 检查结果
data
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
日期 气温 气压 平均海平面大气压 气压趋势 湿度 风向 平均风速 最大阵风 天气情况 最低气温 最高气温 露点温度 降水时间 季节 质量等级
0 2013-12-02 3.162 759.712 764.462 -0.013 45.875 7.375 1.375 7.177 5.500 6.896 18.301 -8.588 10.717 轻度污染
1 2013-12-03 5.488 761.725 766.425 0.100 39.000 8.000 1.625 7.177 4.750 7.209 18.314 -8.900 10.717
2 2013-12-04 5.250 760.300 764.988 -0.138 45.375 9.375 1.250 7.177 1.750 7.134 18.714 -6.675 10.717 轻度污染
3 2013-12-05 6.150 763.275 767.975 0.250 30.000 6.875 2.250 7.177 3.875 7.759 18.551 -10.912 10.717
4 2013-12-06 2.925 760.325 765.075 -0.275 52.750 4.875 1.250 7.177 1.000 7.121 18.239 -6.350 10.717 中度污染
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3676 2023-12-26 -2.050 770.638 773.938 0.925 54.125 8.250 1.500 7.147 0.500 -9.475 2.588 -10.862 12.000
3677 2023-12-27 -3.888 771.538 774.850 -0.538 67.750 7.250 1.125 4.250 0.000 -8.900 5.348 -9.450 12.000
3678 2023-12-28 -3.012 769.138 772.438 -0.038 69.875 6.625 1.000 3.750 0.375 -9.100 3.750 -8.288 12.000 轻度污染
3679 2023-12-29 -2.800 765.112 768.400 -0.938 78.125 5.625 1.125 4.147 2.000 -6.302 3.975 -6.300 12.000 轻度污染
3680 2023-12-30 -1.238 760.250 763.512 0.225 75.125 2.625 1.125 3.875 2.000 -6.562 2.950 -5.625 12.000

3681 rows × 16 columns

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# 初始化标签编码器
le = LabelEncoder()

# 对质量等级进行编码
data['质量等级'] = le.fit_transform(data['质量等级'])
data['季节'] = le.fit_transform(data['季节'])

# 选择特定的特征列
selected_features = ['气温', '气压', '平均海平面大气压', '气压趋势', '湿度', '风向', '平均风速', '最大阵风', '天气情况', '最低气温', '最高气温', '露点温度', '降水时间', '季节', '质量等级']

# 计算选定特征列的相关性
correlation = data[selected_features].corr()['质量等级']

correlation = correlation.abs()

# 按照绝对值降序排序
correlation = correlation.reindex(correlation.abs().sort_values(ascending=False).index)

# 创建一个条形图
plt.figure(figsize=(12, 8))
sns.barplot(x=correlation[1:], hue = correlation.index[1:], legend = 'auto', palette='coolwarm')

# 添加标题和标签
plt.title('质量等级的相关性分析')
plt.xlabel('相关性')
plt.ylabel('特征值')

# 显示图表
plt.show()

print(correlation)

png

质量等级        1.000000
平均风速        0.164287
最大阵风        0.113761
最高气温        0.111750
气压          0.109454
平均海平面大气压    0.106138
露点温度        0.098350
气温          0.095311
气压趋势        0.079601
最低气温        0.076747
风向          0.070087
天气情况        0.067813
降水时间        0.063081
湿度          0.052930
季节          0.043512
Name: 质量等级, dtype: float64

划分训练集,尽可能地减少对难预测的空气污染物成分的依赖

# 将data的值复制到df当中
df = merged_data


# 执行独热编码转换类别字段
df = pd.get_dummies(df, columns=['季节'])


# 预测前,将数据集划分为训练集和验证集,尽可能地减少对难预测的空气污染物成分的依赖
X = df.drop(columns=['质量等级','日期','AQI','O3_8h','NO2','SO2'])
y = df['质量等级']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

使用随机森林的默认状态进行调试

随机森林参数调优

# 使用随机森林分类器进行训练
# classifier = RandomForestClassifier(n_estimators=120, random_state=42)
## 参数调优 
classifier = RandomForestClassifier(n_estimators=50, 
                                    min_samples_leaf=5, 
                                    min_samples_split=10, 
                                    random_state=42)

classifier.fit(X_train, y_train)

# 进行预测并检查准确率
predictions = classifier.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print("预测的准确率是:", accuracy)
# 概率
predicted_proba = classifier.predict_proba(X_test)

# 计算混淆矩阵并创建热力图
cm = confusion_matrix(y_test, predictions)
plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot = True, cmap = 'Blues')
plt.title('随机森林方法')
plt.xlabel('预测值')
plt.ylabel('真实值')



# 训练集上的预测
train_predictions = classifier.predict(X_train)
# 计算准确率、召回率、精确率 和 F1分数
accuracy_train = classifier.score(X_train, y_train)
recall_train = recall_score(y_train, train_predictions, average='weighted')
precision_train = precision_score(y_train, train_predictions, average='weighted',zero_division=1)
f1_train = f1_score(y_train, train_predictions, average='weighted')
accuracy_test = accuracy
recall_test = recall_score(y_test, predictions, average='weighted')
precision_test = precision_score(y_test, predictions, average='weighted',zero_division=1)
f1_test = f1_score(y_test, predictions, average='weighted')
# 创建 DataFrame
performance = pd.DataFrame({
    '准确率': [accuracy_train, accuracy_test],
    '召回率': [recall_train, recall_test],
    '精确率': [precision_train, precision_test],
    'F1': [f1_train, f1_test]
}, index = ['训练集', '测试集'])
# 显示 performance
performance_styler = performance.style.set_properties(**{'text-align': 'center'})
display(performance_styler)
# 构建预测结果对照表
results = pd.DataFrame({
    '真实值': y_test,
    '预测值': predictions
})

# 获得类别列表,按照模型内部的顺序
class_list = classifier.classes_

# 将预测的概率与其对应的类别关联起来
for i, quality_level in enumerate(class_list):
    results[f'{quality_level}预测概率'] = predicted_proba[:, i]

# 使用 .head() 方法获取前20条数据
results_head = results.head(20)

# 设置数据显示为居中格式
results_styler = results_head.style.set_properties(**{'text-align': 'center'})

# 显示居中对齐的前100条数据
display(results_styler)
预测的准确率是: 0.8168249660786974
  准确率 召回率 精确率 F1
训练集 0.920177 0.920177 0.921693 0.918641
测试集 0.816825 0.816825 0.819219 0.815129
  真实值 预测值 严重污染预测概率 中度污染预测概率 优预测概率 无预测概率 良预测概率 轻度污染预测概率 重度污染预测概率
1097 严重污染 重度污染 0.242753 0.028671 0.000000 0.000000 0.007273 0.003818 0.717485
2784 0.002857 0.003500 0.580033 0.000000 0.355595 0.058015 0.000000
2440 0.000000 0.035993 0.159627 0.002500 0.588910 0.212970 0.000000
1694 0.001538 0.008901 0.475104 0.000000 0.389171 0.119131 0.006154
2494 0.000000 0.000000 0.163351 0.002500 0.757525 0.076624 0.000000
2270 轻度污染 轻度污染 0.000000 0.168943 0.012083 0.000000 0.035368 0.750467 0.033139
3477 0.000000 0.008205 0.188131 0.001250 0.676005 0.125076 0.001333
937 中度污染 0.000000 0.530691 0.000000 0.000000 0.141202 0.304054 0.024053
495 中度污染 中度污染 0.064650 0.449891 0.012000 0.000000 0.082585 0.196608 0.194265
798 重度污染 重度污染 0.120026 0.219474 0.015000 0.000000 0.092429 0.229477 0.323595
3375 0.001667 0.000000 0.980167 0.000000 0.016500 0.001667 0.000000
1747 0.006000 0.000000 0.726175 0.005778 0.256968 0.005079 0.000000
1487 重度污染 重度污染 0.151462 0.156333 0.000000 0.000000 0.003333 0.012222 0.676650
969 中度污染 中度污染 0.005934 0.564135 0.003205 0.000000 0.034687 0.221312 0.170726
2883 中度污染 中度污染 0.016024 0.445166 0.000000 0.000000 0.099437 0.374235 0.065138
655 轻度污染 轻度污染 0.000000 0.108666 0.000000 0.000000 0.220001 0.654238 0.017095
229 轻度污染 轻度污染 0.001538 0.299135 0.002857 0.000000 0.083248 0.609888 0.003333
1116 0.000000 0.013553 0.084372 0.000000 0.880953 0.016122 0.005000
840 中度污染 中度污染 0.020206 0.757988 0.000000 0.000000 0.004524 0.162131 0.055151
32 0.000000 0.016905 0.008667 0.000000 0.895619 0.078810 0.000000

png

支持向量机分类算法

# 使用支持向量机分类器进行训练
# 默认参数
# classifier = svm.SVC(probability=True)
# classifier.fit(X_train, y_train)


# 参数调优
classifier = svm.SVC(probability=True, C=0.8, kernel='linear', gamma=0.01)
classifier.fit(X_train, y_train)

# 执行预测并计算准确度
predictions = classifier.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print("预测准确率:", accuracy)

# 概率
predicted_proba = classifier.predict_proba(X_test)

# 进行预测并检查准确率
predictions = classifier.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print("预测的准确率是:", accuracy)
# 概率
predicted_proba = classifier.predict_proba(X_test)

# 计算混淆矩阵并创建热力图
cm = confusion_matrix(y_test, predictions)
plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot = True, cmap = 'Blues')
plt.title('支持向量机方法')
plt.xlabel('预测值')
plt.ylabel('真实值')



# 训练集上的预测
train_predictions = classifier.predict(X_train)
# 计算准确率、召回率、精确率 和 F1分数
accuracy_train = classifier.score(X_train, y_train)
recall_train = recall_score(y_train, train_predictions, average='weighted')
precision_train = precision_score(y_train, train_predictions, average='weighted',zero_division=1)
f1_train = f1_score(y_train, train_predictions, average='weighted')
accuracy_test = accuracy
recall_test = recall_score(y_test, predictions, average='weighted')
precision_test = precision_score(y_test, predictions, average='weighted',zero_division=1)
f1_test = f1_score(y_test, predictions, average='weighted')
# 创建 DataFrame
performance = pd.DataFrame({
    '准确率': [accuracy_train, accuracy_test],
    '召回率': [recall_train, recall_test],
    '精确率': [precision_train, precision_test],
    'F1': [f1_train, f1_test]
}, index = ['训练集', '测试集'])
# 显示 performance
performance_styler = performance.style.set_properties(**{'text-align': 'center'})
display(performance_styler)
# 构建预测结果对照表
results = pd.DataFrame({
    '真实值': y_test,
    '预测值': predictions
})

# 获得类别列表,按照模型内部的顺序
class_list = classifier.classes_

# 将预测的概率与其对应的类别关联起来
for i, quality_level in enumerate(class_list):
    results[f'{quality_level}预测概率'] = predicted_proba[:, i]

# 使用 .head() 方法获取前20条数据
results_head = results.head(20)

# 设置数据显示为居中格式
results_styler = results_head.style.set_properties(**{'text-align': 'center'})

# 显示居中对齐的前100条数据
display(results_styler)
预测准确率: 0.7584803256445047
预测的准确率是: 0.7584803256445047
</style>
  准确率 召回率 精确率 F1
训练集 0.754416 0.754416 0.753321 0.751580
测试集 0.758480 0.758480 0.757210 0.755971
  真实值 预测值 严重污染预测概率 中度污染预测概率 优预测概率 无预测概率 良预测概率 轻度污染预测概率 重度污染预测概率
1097 严重污染 重度污染 0.300962 0.053690 0.002110 0.002733 0.001727 0.005712 0.633066
2784 0.001050 0.005165 0.331282 0.001943 0.618779 0.041574 0.000207
2440 0.001645 0.022439 0.061227 0.001645 0.718252 0.193896 0.000895
1694 0.003246 0.005873 0.672221 0.005097 0.286800 0.025789 0.000974
2494 0.001708 0.006182 0.043863 0.000259 0.844592 0.101828 0.001568
2270 轻度污染 轻度污染 0.002498 0.370101 0.001685 0.001179 0.161089 0.424686 0.038762
3477 0.000512 0.032570 0.332566 0.001983 0.524911 0.104358 0.003101
937 中度污染 0.003020 0.459037 0.002037 0.000806 0.056424 0.465143 0.013532
495 中度污染 中度污染 0.014296 0.516219 0.002520 0.002577 0.042034 0.322502 0.099852
798 重度污染 严重污染 0.136483 0.412313 0.015048 0.007002 0.021073 0.056832 0.351248
3375 0.000160 0.001549 0.974973 0.001326 0.019990 0.001005 0.000997
1747 0.005040 0.002388 0.909634 0.000909 0.074456 0.005327 0.002246
1487 重度污染 重度污染 0.035081 0.346609 0.002323 0.001477 0.011112 0.045734 0.557664
969 中度污染 中度污染 0.009966 0.667408 0.000767 0.002853 0.006944 0.102986 0.209076
2883 中度污染 中度污染 0.008358 0.351996 0.007579 0.006394 0.056464 0.448937 0.120271
655 轻度污染 轻度污染 0.001423 0.113934 0.002714 0.000850 0.231265 0.637323 0.012491
229 轻度污染 中度污染 0.002663 0.525737 0.000823 0.000796 0.023640 0.428292 0.018049
1116 0.000173 0.002357 0.047882 0.000070 0.918638 0.030178 0.000702
840 中度污染 中度污染 0.014819 0.517901 0.001452 0.000666 0.015056 0.252036 0.198070
32 0.000178 0.012099 0.004606 0.000033 0.797791 0.183794 0.001499

png

LSTM-Algorithm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt   ##绘图库
import seaborn as sns ##图片风格扩展
from datetime import timedelta  #时间
from sklearn.preprocessing import LabelEncoder  ##label编码
from matplotlib import font_manager  ##解决plot中文字符显示问题             
plt.rcParams['font.sans-serif'] = ['SimHei']  # 指定默认字体为新宋体。
plt.rcParams['axes.unicode_minus'] = False  # 解决保存图像时'-'显示为方块的问题。
from matplotlib import colors
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split   ## 划分
from sklearn.metrics import r2_score
import torch
## 代替keras库,LSTM模型
from torch import nn
from sklearn.preprocessing import MinMaxScaler

# 检查是否有可用的 GPU
if torch.cuda.is_available():
    device = torch.device("cuda")  # 使用GPU
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")  # 使用 CPU
    print("Using CPU")
Using GPU: NVIDIA GeForce MX450

整理天气数据 - 缺失值和异常值处理 - 对列重命名

# 加载 csv文件
weather_data = pd.read_csv('weatherdata.csv')

# 将时间列转化为日期时间格式
weather_data['当地时间 北京'] = pd.to_datetime(weather_data['当地时间 北京'], format='%d.%m.%Y %H:%M')  # 处理中文版本csv文件 

# 遍历数据集中的每一个列
for col in weather_data.columns:
    # 如果数据类型是非数值型并且列名不是'当地时间 北京'
    if pd.api.types.is_string_dtype(weather_data[col]) and col != '当地时间 北京':  # 对中文版csv文件操作
        # 使用factorize对非数值型的列进行编码
        weather_data[col] = pd.factorize(weather_data[col])[0]

# 下面的步骤只会对数值类型的列进行操作
numeric_columns = weather_data.columns[weather_data.dtypes != 'object']

# 使用每列的平均值填充对应列的缺失值
weather_data[numeric_columns] = weather_data[numeric_columns].fillna(weather_data[numeric_columns].mean())
# 删除含有 NaN 的列
weather_data = weather_data.dropna(axis=1)

# 定义新的列名
column_dict = {'当地时间 北京':'日期','T': '气温', 'Po': '气压', 'P': '平均海平面大气压', 'Pa': '气压趋势', 'U': '湿度', 'DD': '风向', 'Ff': '平均风速', 'ff3': '最大阵风', 'WW': '天气情况', 'Tn': '最低气温', 'Tx': '最高气温', 'Td': '露点温度', 'tR': '降水时间'}

# 更改列名
weather_data = weather_data.rename(columns=column_dict)

# 从'日期'列中提取日期
weather_data['日期'] = pd.to_datetime(weather_data['日期']).dt.date

# 计算每日平均值
daily_avg_data = weather_data.groupby('日期').mean().reset_index()

daily_avg_data = daily_avg_data.round(3)

# 检查结果
daily_avg_data
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
日期 气温 气压 平均海平面大气压 气压趋势 湿度 风向 平均风速 最大阵风 天气情况 最低气温 最高气温 露点温度 降水时间
0 2013-12-02 3.162 759.712 764.462 -0.013 45.875 7.375 1.375 7.177 5.500 6.896 18.301 -8.588 10.717
1 2013-12-03 5.488 761.725 766.425 0.100 39.000 8.000 1.625 7.177 4.750 7.209 18.314 -8.900 10.717
2 2013-12-04 5.250 760.300 764.988 -0.138 45.375 9.375 1.250 7.177 1.750 7.134 18.714 -6.675 10.717
3 2013-12-05 6.150 763.275 767.975 0.250 30.000 6.875 2.250 7.177 3.875 7.759 18.551 -10.912 10.717
4 2013-12-06 2.925 760.325 765.075 -0.275 52.750 4.875 1.250 7.177 1.000 7.121 18.239 -6.350 10.717
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3676 2023-12-26 -2.050 770.638 773.938 0.925 54.125 8.250 1.500 7.147 0.500 -9.475 2.588 -10.862 12.000
3677 2023-12-27 -3.888 771.538 774.850 -0.538 67.750 7.250 1.125 4.250 0.000 -8.900 5.348 -9.450 12.000
3678 2023-12-28 -3.012 769.138 772.438 -0.038 69.875 6.625 1.000 3.750 0.375 -9.100 3.750 -8.288 12.000
3679 2023-12-29 -2.800 765.112 768.400 -0.938 78.125 5.625 1.125 4.147 2.000 -6.302 3.975 -6.300 12.000
3680 2023-12-30 -1.238 760.250 763.512 0.225 75.125 2.625 1.125 3.875 2.000 -6.562 2.950 -5.625 12.000

3681 rows × 14 columns

合并天气数据与空气质量数据

# 确保两个 DataFrame 的日期列都是日期格式
aqi_data = pd.read_csv('SeasonAdded.csv')

aqi_data['日期'] = pd.to_datetime(aqi_data['日期'])
daily_avg_data['日期'] = pd.to_datetime(daily_avg_data['日期'])

# 合并两个 DataFrame
merged_data = pd.merge(daily_avg_data, aqi_data, on='日期', how='inner')

merged_data.to_csv('merged_data.csv', index=False)

merged_data
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
日期 气温 气压 平均海平面大气压 气压趋势 湿度 风向 平均风速 最大阵风 天气情况 ... 降水时间 季节 AQI 质量等级 PM2.5 PM10 NO2 CO SO2 O3_8h
0 2013-12-02 3.162 759.712 764.462 -0.013 45.875 7.375 1.375 7.177 5.500 ... 10.717 142 轻度污染 109 138 88 2.6 61 11
1 2013-12-03 5.488 761.725 766.425 0.100 39.000 8.000 1.625 7.177 4.750 ... 10.717 86 64 86 54 1.6 38 45
2 2013-12-04 5.250 760.300 764.988 -0.138 45.375 9.375 1.250 7.177 1.750 ... 10.717 109 轻度污染 82 101 62 2.0 42 23
3 2013-12-05 6.150 763.275 767.975 0.250 30.000 6.875 2.250 7.177 3.875 ... 10.717 56 39 56 38 1.2 30 52
4 2013-12-06 2.925 760.325 765.075 -0.275 52.750 4.875 1.250 7.177 1.000 ... 10.717 169 中度污染 128 162 78 2.5 48 15
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3676 2023-12-26 -2.050 770.638 773.938 0.925 54.125 8.250 1.500 7.147 0.500 ... 12.000 55 26 46 44 0.7 3 0
3677 2023-12-27 -3.888 771.538 774.850 -0.538 67.750 7.250 1.125 4.250 0.000 ... 12.000 64 45 71 51 0.8 3 34
3678 2023-12-28 -3.012 769.138 772.438 -0.038 69.875 6.625 1.000 3.750 0.375 ... 12.000 129 轻度污染 98 132 69 1.2 3 21
3679 2023-12-29 -2.800 765.112 768.400 -0.938 78.125 5.625 1.125 4.147 2.000 ... 12.000 150 轻度污染 115 145 62 1.2 3 45
3680 2023-12-30 -1.238 760.250 763.512 0.225 75.125 2.625 1.125 3.875 2.000 ... 12.000 32 12 29 21 0.3 2 64

3681 rows × 23 columns

探究各种空气质量指数与天气特征的相关性

# 定义你需要的列
columns = ['气温', '气压', '平均海平面大气压', '气压趋势', '湿度', '风向', '平均风速', '最大阵风', '天气情况', '最低气温', '最高气温', '露点温度', '降水时间', '季节', 'PM2.5', 'PM10', 'NO2', 'CO', 'SO2', 'O3_8h']

# 初始化编码器
encoder = LabelEncoder()

# 对"季节"进行标签编码
merged_data['季节'] = encoder.fit_transform(merged_data['季节'])

# 目标列列表
targets = ['PM2.5', 'PM10', 'NO2', 'CO', 'SO2', 'O3_8h']

# 对每个目标列进行相关性分析
for target in targets:

    # 暂时移除其他目标列
    temp_columns = [c for c in columns if c not in targets or c == target]

    # 选择这些列进行相关性分析
    selected_data = merged_data[temp_columns]
    correlation = selected_data.corr()

    # 找到与目标列相关的列并按相关性的绝对值进行降序排列
    target_correlation_abs = correlation[target].abs().sort_values(ascending=False)

    # 打印结果
    print(f'\n{target}的相关性分析结果:\n')
    print(target_correlation_abs)

    # 创建一个新的图像并设置其大小
    plt.figure(figsize=(10,10))

    # 创建颜色映射,使用颜色渐变。取决对系数值的绝对值创建颜色映射,以表示相关性强度
    color_map = colors.LinearSegmentedColormap.from_list("", ["green","yellow","red"])

    # 创建条形图,并用颜色映射来设置每个条的颜色
    ## sns.barplot(x=target_correlation_abs.values, y=target_correlation_abs.index, palette=sns.color_palette("coolwarm", len(target_correlation_abs)))
    sns.barplot(x=target_correlation_abs.values, hue = target_correlation_abs.index, legend=True, palette=sns.color_palette("coolwarm", len(target_correlation_abs)))

    # 添加标题和标签
    plt.title(f'{target}的相关性系数条形图')
    plt.xlabel('相关系数值')
    plt.ylabel('特征')

    # 显示图像
    plt.show()
PM2.5的相关性分析结果:

PM2.5       1.000000
平均风速        0.288130
湿度          0.254683
风向          0.181265
气温          0.156376
气压趋势        0.154640
最大阵风        0.146046
平均海平面大气压    0.105068
最低气温        0.100934
最高气温        0.088201
气压          0.075170
降水时间        0.048521
季节          0.035743
天气情况        0.015611
露点温度        0.000350
Name: PM2.5, dtype: float64

png

PM10的相关性分析结果:

PM10        1.000000
平均风速        0.183021
气压趋势        0.160852
气温          0.124652
风向          0.116126
最低气温        0.104165
最大阵风        0.087421
露点温度        0.071516
最高气温        0.056746
平均海平面大气压    0.052922
湿度          0.049332
天气情况        0.035941
降水时间        0.033476
气压          0.027195
季节          0.016593
Name: PM10, dtype: float64

png

NO2的相关性分析结果:

NO2         1.000000
气温          0.313374
平均风速        0.305270
平均海平面大气压    0.297695
气压          0.253524
最低气温        0.227680
露点温度        0.223513
最高气温        0.197564
气压趋势        0.167140
风向          0.118285
最大阵风        0.115511
降水时间        0.100632
天气情况        0.099558
湿度          0.011815
季节          0.011596
Name: NO2, dtype: float64

png

CO的相关性分析结果:

CO          1.000000
平均风速        0.290830
气温          0.247104
湿度          0.232125
风向          0.213775
平均海平面大气压    0.178089
季节          0.156474
气压          0.142585
最大阵风        0.119038
气压趋势        0.102697
天气情况        0.085846
最高气温        0.080423
露点温度        0.077970
降水时间        0.074248
最低气温        0.073578
Name: CO, dtype: float64

png

SO2的相关性分析结果:

SO2         1.000000
气温          0.306540
露点温度        0.254694
平均海平面大气压    0.233613
季节          0.227524
气压          0.198480
风向          0.131312
平均风速        0.075902
湿度          0.075378
最低气温        0.073285
最高气温        0.072347
气压趋势        0.066544
降水时间        0.013580
天气情况        0.011971
最大阵风        0.003593
Name: SO2, dtype: float64

png

O3_8h的相关性分析结果:

O3_8h       1.000000
气温          0.740916
平均海平面大气压    0.685109
气压          0.682996
露点温度        0.616646
最高气温        0.500138
最低气温        0.460675
降水时间        0.140944
湿度          0.113398
季节          0.060095
平均风速        0.059269
气压趋势        0.026255
天气情况        0.007104
最大阵风        0.004726
风向          0.001602
Name: O3_8h, dtype: float64

png

merged_data
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
日期 气温 气压 平均海平面大气压 气压趋势 湿度 风向 平均风速 最大阵风 天气情况 ... 降水时间 季节 AQI 质量等级 PM2.5 PM10 NO2 CO SO2 O3_8h
0 2013-12-02 3.162 759.712 764.462 -0.013 45.875 7.375 1.375 7.177 5.500 ... 10.717 0 142 轻度污染 109 138 88 2.6 61 11
1 2013-12-03 5.488 761.725 766.425 0.100 39.000 8.000 1.625 7.177 4.750 ... 10.717 0 86 64 86 54 1.6 38 45
2 2013-12-04 5.250 760.300 764.988 -0.138 45.375 9.375 1.250 7.177 1.750 ... 10.717 0 109 轻度污染 82 101 62 2.0 42 23
3 2013-12-05 6.150 763.275 767.975 0.250 30.000 6.875 2.250 7.177 3.875 ... 10.717 0 56 39 56 38 1.2 30 52
4 2013-12-06 2.925 760.325 765.075 -0.275 52.750 4.875 1.250 7.177 1.000 ... 10.717 0 169 中度污染 128 162 78 2.5 48 15
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3676 2023-12-26 -2.050 770.638 773.938 0.925 54.125 8.250 1.500 7.147 0.500 ... 12.000 0 55 26 46 44 0.7 3 0
3677 2023-12-27 -3.888 771.538 774.850 -0.538 67.750 7.250 1.125 4.250 0.000 ... 12.000 0 64 45 71 51 0.8 3 34
3678 2023-12-28 -3.012 769.138 772.438 -0.038 69.875 6.625 1.000 3.750 0.375 ... 12.000 0 129 轻度污染 98 132 69 1.2 3 21
3679 2023-12-29 -2.800 765.112 768.400 -0.938 78.125 5.625 1.125 4.147 2.000 ... 12.000 0 150 轻度污染 115 145 62 1.2 3 45
3680 2023-12-30 -1.238 760.250 763.512 0.225 75.125 2.625 1.125 3.875 2.000 ... 12.000 0 32 12 29 21 0.3 2 64

3681 rows × 23 columns

构建合适的特征变量组,从各个目标的最相关特征值中按照频次提取出最常见的特征

from collections import Counter

feature_dict = {
    'PM2.5': ['平均风速', '湿度', '风向', '气温', '气压趋势', '最大阵风', '平均海平面大气压', '最低气温'],
    'PM10': ['平均风速', '气压趋势', '气温', '风向', '最低气温'],
    'NO2': ['气温', '平均风速', '平均海平面大气压', '气压', '最低气温', '露点温度', '最高气温', '气压趋势', '风向', '最大阵风', '降水时间'],
    'CO': ['平均风速', '气温', '湿度', '风向', '平均海平面大气压', '季节', '气压', '最大阵风', '气压趋势'],
    'SO2': ['气温', '露点温度', '平均海平面大气压', '季节', '气压', '风向'],
    'O3_8h': ['气温', '平均海平面大气压', '气压', '露点温度', '最高气温', '最低气温', '降水时间', '湿度']
}

# 提取所有特征
all_features = [feature for sublist in feature_dict.values() for feature in sublist]

# 计算每个特征的频率
feature_counts = Counter(all_features)


# 提取最常见的特征
common_features = [feature[0] for feature in feature_counts.most_common()]

#添加到feature_cols
feature_cols = common_features + ['PM2.5', 'PM10', 'NO2', 'CO', 'SO2', 'O3_8h']

时间序列模型预测未来数据, 通过调整参数发现设置时序步长为1时,预测效果最好

# 特征和目标变量选择
feature_cols = ['气温', '平均风速', '平均海平面大气压', '气压', '最低气温', '露点温度', '最高气温', '气压趋势', '风向', '最大阵风', '降水时间','季节','PM2.5', 'PM10', 'NO2', 'CO', 'SO2', 'O3_8h'] 
target_cols = ['PM2.5', 'PM10', 'NO2', 'CO', 'SO2', 'O3_8h']

X = merged_data[feature_cols].values
y = merged_data[target_cols].values

# 数据规范化
scaler = MinMaxScaler(feature_range=(0, 1))
X = scaler.fit_transform(X)
y = scaler.fit_transform(y)

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 转换输入形状为(samples, time steps, features)
X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))

class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(LSTMModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = torch.Tensor(x)
        output, (hn, cn) = self.lstm(x)
        hn = hn.view(-1, self.hidden_dim)
        out = self.fc(hn)
        return out
    
model = LSTMModel(X_train.shape[2], 128, len(target_cols))
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-8) # 加入了权重衰减项

epochs = 90
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model.forward(X_train)
    optimizer.zero_grad()
    loss = criterion(outputs, torch.Tensor(y_train))
    loss.backward()
    optimizer.step()
    if epoch % 1 == 0:  # print loss every 1 epochs
        print(f'Epoch: {epoch} \tLoss: {loss.item()}')

with torch.no_grad():
    y_train_pred = model.forward(X_train)
    y_test_pred = model.forward(X_test)
train_loss = criterion(y_train_pred, torch.Tensor(y_train)).item() 
test_loss = criterion(y_test_pred, torch.Tensor(y_test)).item()

# 训练集和测试集上的损失
print("Train loss: %.3f" % train_loss)
print("Test loss: %.3f" % test_loss)


# 获取预测结果,并进行反归一化
y_train_pred = model(X_train).detach().numpy()
y_test_pred = model(X_test).detach().numpy()
y_train_pred = scaler.inverse_transform(y_train_pred)
y_test_pred = scaler.inverse_transform(y_test_pred)

# 对真实结果进行反归一化
y_train_real = scaler.inverse_transform(y_train)
y_test_real = scaler.inverse_transform(y_test)

# 为预测结果和真实结果创建DataFrame
train_dict = {}
test_dict = {}
for idx, col in enumerate(target_cols):
    train_dict[col+'_pred'] = y_train_pred[:, idx]
    train_dict[col+'_real'] = y_train_real[:, idx]
    test_dict[col+'_pred'] = y_test_pred[:, idx]
    test_dict[col+'_real'] = y_test_real[:, idx]
train_result = pd.DataFrame(train_dict)
test_result = pd.DataFrame(test_dict)

# 计算RMSE
train_rmse = np.sqrt(train_loss)
test_rmse = np.sqrt(test_loss)

print("Train RMSE: %.3f" % train_rmse)
print("Test RMSE: %.3f" % test_rmse)

# 训练集的R2 score
r2_train = r2_score(y_train_real, y_train_pred)
print("Train R^2: %.3f" % r2_train)

# 测试集的R2 score
r2_test = r2_score(y_test_real, y_test_pred)
print("Test R^2: %.3f" % r2_test)

# 显示结果
test_result.to_csv('测试集结果.csv')

# 使用全量的数据进行预测
all_data = X  # 注意这里直接使用了全部的X数据

all_data = np.reshape(all_data, (all_data.shape[0], 1, all_data.shape[1]))  # 调整数据形状以符合模型输入要求
with torch.no_grad():
    all_data_pred = model(all_data)  # 使用模型进行预测
    
all_data_pred = all_data_pred.detach().numpy()
all_data_pred = scaler.inverse_transform(all_data_pred)  # 将预测数据进行反归一化

# 获取未来xx天的预测数据
last_30_days_predictions = all_data_pred[-30:, :]
predictions_df = pd.DataFrame(last_30_days_predictions, columns=target_cols)
predictions_df = predictions_df.select_dtypes(include=['int', 'float','float64']).astype(float)
predictions_df = predictions_df.round(3)   ##保留三位小数
predictions_df[predictions_df<0] = 0   ## 去除负数


test_result
Epoch: 0 	Loss: 0.06279667466878891
Epoch: 1 	Loss: 0.03268563747406006
Epoch: 2 	Loss: 0.016131838783621788
Epoch: 3 	Loss: 0.020107517018914223
Epoch: 4 	Loss: 0.019965996965765953
Epoch: 5 	Loss: 0.016072137281298637
Epoch: 6 	Loss: 0.015939587727189064
Epoch: 7 	Loss: 0.0146373575553298
Epoch: 8 	Loss: 0.01211472973227501
Epoch: 9 	Loss: 0.01126999780535698
Epoch: 10 	Loss: 0.01241952646523714
Epoch: 11 	Loss: 0.012912060134112835
Epoch: 12 	Loss: 0.01120476983487606
Epoch: 13 	Loss: 0.008758388459682465
Epoch: 14 	Loss: 0.007574760355055332
Epoch: 15 	Loss: 0.0078633613884449
Epoch: 16 	Loss: 0.00812815222889185
Epoch: 17 	Loss: 0.007398997433483601
Epoch: 18 	Loss: 0.006387569010257721
Epoch: 19 	Loss: 0.005937131587415934
Epoch: 20 	Loss: 0.005659068934619427
Epoch: 21 	Loss: 0.0049650222063064575
Epoch: 22 	Loss: 0.004351222887635231
Epoch: 23 	Loss: 0.004472789354622364
Epoch: 24 	Loss: 0.004842758644372225
Epoch: 25 	Loss: 0.004543093033134937
Epoch: 26 	Loss: 0.003780539147555828
Epoch: 27 	Loss: 0.0034654217306524515
Epoch: 28 	Loss: 0.003692377358675003
Epoch: 29 	Loss: 0.0037616209592670202
Epoch: 30 	Loss: 0.0034925437066704035
Epoch: 31 	Loss: 0.0032813141588121653
Epoch: 32 	Loss: 0.003235272131860256
Epoch: 33 	Loss: 0.003155637299641967
Epoch: 34 	Loss: 0.0030356040224432945
Epoch: 35 	Loss: 0.003006354672834277
Epoch: 36 	Loss: 0.0030242251232266426
Epoch: 37 	Loss: 0.002940405858680606
Epoch: 38 	Loss: 0.002780000679194927
Epoch: 39 	Loss: 0.0026939152739942074
Epoch: 40 	Loss: 0.002677084179595113
Epoch: 41 	Loss: 0.002596325008198619
Epoch: 42 	Loss: 0.0024548254441469908
Epoch: 43 	Loss: 0.0023730238899588585
Epoch: 44 	Loss: 0.0023465296253561974
Epoch: 45 	Loss: 0.002263784408569336
Epoch: 46 	Loss: 0.0021349024027585983
Epoch: 47 	Loss: 0.002071073977276683
Epoch: 48 	Loss: 0.002063565421849489
Epoch: 49 	Loss: 0.0020039009395986795
Epoch: 50 	Loss: 0.0018945776391774416
Epoch: 51 	Loss: 0.0018265795661136508
Epoch: 52 	Loss: 0.0017962786369025707
Epoch: 53 	Loss: 0.001734274672344327
Epoch: 54 	Loss: 0.0016526338877156377
Epoch: 55 	Loss: 0.0015956064453348517
Epoch: 56 	Loss: 0.0015411977656185627
Epoch: 57 	Loss: 0.0014713113196194172
Epoch: 58 	Loss: 0.0014099245890974998
Epoch: 59 	Loss: 0.0013530300930142403
Epoch: 60 	Loss: 0.0012927282368764281
Epoch: 61 	Loss: 0.001241533667780459
Epoch: 62 	Loss: 0.0011791670694947243
Epoch: 63 	Loss: 0.0011202717432752252
Epoch: 64 	Loss: 0.0010907152900472283
Epoch: 65 	Loss: 0.001041788375005126
Epoch: 66 	Loss: 0.0009955406421795487
Epoch: 67 	Loss: 0.0009633706649765372
Epoch: 68 	Loss: 0.0009349179454147816
Epoch: 69 	Loss: 0.0009026097250171006
Epoch: 70 	Loss: 0.0008717221789993346
Epoch: 71 	Loss: 0.0008441174868494272
Epoch: 72 	Loss: 0.0008267457014881074
Epoch: 73 	Loss: 0.0007946713594719768
Epoch: 74 	Loss: 0.0007775386329740286
Epoch: 75 	Loss: 0.0007490086136385798
Epoch: 76 	Loss: 0.0007301571895368397
Epoch: 77 	Loss: 0.0007017483003437519
Epoch: 78 	Loss: 0.0006775397341698408
Epoch: 79 	Loss: 0.0006535137072205544
Epoch: 80 	Loss: 0.0006273516337387264
Epoch: 81 	Loss: 0.0006005215109325945
Epoch: 82 	Loss: 0.0005815316108055413
Epoch: 83 	Loss: 0.0005547624314203858
Epoch: 84 	Loss: 0.0005313107394613326
Epoch: 85 	Loss: 0.0005118245608173311
Epoch: 86 	Loss: 0.0004902610089629889
Epoch: 87 	Loss: 0.0004675433156080544
Epoch: 88 	Loss: 0.00045090654748491943
Epoch: 89 	Loss: 0.0004318800347391516
Train loss: 0.000
Test loss: 0.000
Train RMSE: 0.020
Test RMSE: 0.021
Train R^2: 0.965
Test R^2: 0.961
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
PM2.5_pred PM2.5_real PM10_pred PM10_real NO2_pred NO2_real CO_pred CO_real SO2_pred SO2_real O3_8h_pred O3_8h_real
0 260.290833 252.0 300.762695 284.0 101.588753 104.0 3.581058 3.6 19.884743 18.0 9.184221 17.0
1 14.866858 16.0 35.614517 26.0 12.507151 12.0 0.653138 0.6 3.453637 3.0 72.833450 71.0
2 22.591690 22.0 43.702221 40.0 16.390390 18.0 0.568127 0.6 3.025825 2.0 157.727448 159.0
3 16.380686 14.0 21.891182 18.0 17.284372 17.0 0.636918 0.5 1.324495 3.0 63.488735 55.0
4 37.445534 43.0 57.895954 47.0 31.493208 28.0 0.538045 0.6 0.203518 2.0 62.715031 59.0
... ... ... ... ... ... ... ... ... ... ... ... ...
732 34.874565 28.0 68.627998 75.0 37.168175 41.0 0.463779 0.4 5.629795 5.0 139.438385 142.0
733 15.659326 16.0 18.800871 21.0 37.032345 34.0 0.719412 0.6 16.122654 19.0 46.495991 40.0
734 0.740011 5.0 13.212113 16.0 21.989586 13.0 0.122255 0.2 0.011574 2.0 62.865829 63.0
735 25.367023 18.0 41.213257 47.0 44.415665 48.0 0.583967 0.5 4.592342 3.0 67.892738 70.0
736 12.220972 14.0 51.202724 38.0 20.120926 19.0 0.363505 0.4 2.447656 3.0 51.515011 58.0

737 rows × 12 columns

绘制拟合曲线

# 选择目标变量
target_var = 'PM2.5'
# 选择要绘制的数据条数
n_points = 200

plt.figure(figsize=(14, 6))
plt.plot(test_result[target_var + '_real'].values[:n_points], label='Real')
plt.plot(test_result[target_var + '_pred'].values[:n_points], label='Predicted')
plt.xlabel('Time')
plt.ylabel(target_var)
plt.legend()
plt.title(f'{target_var} 预测值 vs 真实值')
plt.grid(True)
plt.show()

png

绘制误差图

# 计算预测与真实值间的差值
y_diff_train = train_result[target_var+'_real'] - train_result[target_var+'_pred']
y_diff_test = test_result[target_var+'_real'] - test_result[target_var+'_pred']

# 设定图形尺寸
plt.figure(figsize=(14, 6))

# 绘制训练集误差图
plt.plot(y_diff_train.values[:200], 'b-', label='训练数据', alpha=0.6)
plt.fill_between(np.arange(200), y_diff_train[:200], color='blue', alpha=0.3)

# 绘制测试集误差图
plt.plot(y_diff_test.values[:200], 'r-', label='测试数据', alpha=0.6)
plt.fill_between(np.arange(200), y_diff_test[:200], color='red', alpha=0.3)

# 设置标题和标签
plt.xlabel('时间戳')
plt.ylabel('误差')
plt.legend(loc='upper right')

plt.grid(True)  # 添加网格线
plt.show()

png

预测出的未来30天的天气数据

last_day = pd.to_datetime("2023-12-30")  # 最后一天的日期
date_range = pd.date_range(start=last_day + pd.DateOffset(days=1), periods=30)
predictions_df.index = date_range
predictions_df
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
PM2.5 PM10 NO2 CO SO2 O3_8h
2023-12-31 35.268 55.408 37.076 0.600 3.823 56.100
2024-01-01 47.289 62.437 46.951 0.896 4.110 33.773
2024-01-02 75.626 103.339 53.415 1.216 6.150 25.145
2024-01-03 91.936 123.706 56.581 1.357 5.109 22.094
2024-01-04 62.691 141.266 21.805 0.542 0.000 66.851
2024-01-05 126.241 239.232 53.348 1.266 2.239 28.074
2024-01-06 125.366 202.559 55.826 1.452 4.422 30.151
2024-01-07 58.408 94.358 35.112 0.865 1.120 39.813
2024-01-08 36.252 60.305 33.400 0.871 5.222 26.231
2024-01-09 27.063 44.213 28.812 0.600 1.435 43.265
2024-01-10 57.066 70.538 38.374 0.928 1.353 24.144
2024-01-11 87.188 96.541 46.062 1.215 1.138 21.397
2024-01-12 51.797 68.873 29.828 0.847 3.088 41.904
2024-01-13 10.964 28.064 13.281 0.322 0.000 15.794
2024-01-14 7.373 29.943 14.160 0.230 0.000 11.704
2024-01-15 20.184 43.908 28.214 0.403 1.184 9.095
2024-01-16 57.648 76.877 49.969 0.919 4.052 6.342
2024-01-17 23.801 41.559 23.137 0.419 0.025 16.249
2024-01-18 6.101 28.667 17.021 0.227 0.000 11.612
2024-01-19 8.630 35.534 19.632 0.193 0.000 13.358
2024-01-20 23.818 41.332 35.717 0.483 1.306 8.622
2024-01-21 26.983 46.250 30.273 0.506 1.484 16.506
2024-01-22 37.338 51.321 38.070 0.629 1.249 12.827
2024-01-23 64.536 75.616 55.692 1.063 4.004 7.494
2024-01-24 45.481 56.737 40.346 0.773 1.411 12.047
2024-01-25 36.227 45.540 40.347 0.822 1.926 6.843
2024-01-26 56.105 67.965 46.040 0.919 3.578 39.221
2024-01-27 106.544 128.567 61.658 1.450 2.996 24.674
2024-01-28 118.371 147.475 57.756 1.412 2.956 48.443
2024-01-29 20.742 29.503 22.414 0.347 0.000 65.227
import pandas as pd
import numpy as np

# 首先,定义一个函数来计算每种污染物的AQI
def cal_IAQI(pollutant, concentration):
    standard = {
        'PM2.5': [(0, 35), (35, 75), (75, 115), (115, 150), (150, 250), (250, 350), (350, 500)],
        'PM10': [(0, 50), (50, 150), (150, 250), (250, 350), (350, 420), (420, 500), (500, 600)],
        'SO2': [(0, 50), (50, 150), (150, 475), (475, 800), (800, 1600), (1601, 2100), (2100, 2620)],
        'NO2': [(0, 40), (40, 80), (80, 180), (180, 280), (280, 565), (565, 750), (750, 940)],
        'CO': [(0, 5), (5, 10), (10, 35), (35, 60), (60, 90), (90, 120), (120, 150)],
        'O3_8h': [(0, 100), (100, 160), (160, 215), (215, 265), (265, 800)]}
    IAQI = [(0, 50), (50, 100), (100, 150), (150, 200), (200, 300), (300, 400), (400, 500)]
    BP = standard[pollutant]

    for i in range(len(BP)):
        if BP[i][0] <= concentration <= BP[i][1]:
            return ( (IAQI[i][1] - IAQI[i][0]) / (BP[i][1] - BP[i][0]) ) * (concentration - BP[i][0]) + IAQI[i][0]
    return np.nan  # 返回NaN,之后可以方便地替换成AQI的均值

# 定义一个函数来计算总的AQI
def cal_AQI(row):
    pollutants = ['PM2.5', 'PM10', 'SO2', 'NO2', 'CO', 'O3_8h']
    IAQIs = [cal_IAQI(pollutant, row[pollutant]) for pollutant in pollutants]
    return max(IAQIs)

# 计算AQI
predictions_df['calculated_AQI'] = predictions_df.apply(cal_AQI, axis=1)

# 处理异常值:用AQI的均值替换NaN
predictions_df.loc[:, 'calculated_AQI'] = predictions_df.loc[:, 'calculated_AQI'].fillna(predictions_df['calculated_AQI'].mean())


predictions_df['calculated_AQI'] = predictions_df['calculated_AQI'].round(3)

# AQI到空气质量等级的映射
aqi_to_air_quality = [(0, 50, '优'), (51, 100, '良'), (101, 150, '轻度污染'), (151, 200, '中度污染'), 
                      (201, 300, '重度污染'), (301, 500, '严重污染'), (501, float('inf'), '超出范围')]

# 根据AQI值获取空气质量等级
def get_air_quality(aqi):
    for low, high, quality in aqi_to_air_quality:
        if low <= aqi <= high:
            return quality
    return quality

# 计算空气质量等级列
predictions_df['air_quality'] = predictions_df['calculated_AQI'].apply(get_air_quality)

predictions_df.round(3)
predictions_df.to_csv('pred.csv')

predictions_df
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
PM2.5 PM10 NO2 CO SO2 O3_8h calculated_AQI air_quality
2023-12-31 35.268 55.408 37.076 0.600 3.823 56.100 52.704
2024-01-01 47.289 62.437 46.951 0.896 4.110 33.773 65.361
2024-01-02 75.626 103.339 53.415 1.216 6.150 25.145 100.782 超出范围
2024-01-03 91.936 123.706 56.581 1.357 5.109 22.094 121.170 轻度污染
2024-01-04 62.691 141.266 21.805 0.542 0.000 66.851 95.633
2024-01-05 126.241 239.232 53.348 1.266 2.239 28.074 166.059 中度污染
2024-01-06 125.366 202.559 55.826 1.452 4.422 30.151 164.809 中度污染
2024-01-07 58.408 94.358 35.112 0.865 1.120 39.813 79.260
2024-01-08 36.252 60.305 33.400 0.871 5.222 26.231 55.152
2024-01-09 27.063 44.213 28.812 0.600 1.435 43.265 44.213
2024-01-10 57.066 70.538 38.374 0.928 1.353 24.144 77.583
2024-01-11 87.188 96.541 46.062 1.215 1.138 21.397 115.235 轻度污染
2024-01-12 51.797 68.873 29.828 0.847 3.088 41.904 70.996
2024-01-13 10.964 28.064 13.281 0.322 0.000 15.794 28.064
2024-01-14 7.373 29.943 14.160 0.230 0.000 11.704 29.943
2024-01-15 20.184 43.908 28.214 0.403 1.184 9.095 43.908
2024-01-16 57.648 76.877 49.969 0.919 4.052 6.342 78.310
2024-01-17 23.801 41.559 23.137 0.419 0.025 16.249 41.559
2024-01-18 6.101 28.667 17.021 0.227 0.000 11.612 28.667
2024-01-19 8.630 35.534 19.632 0.193 0.000 13.358 35.534
2024-01-20 23.818 41.332 35.717 0.483 1.306 8.622 44.646
2024-01-21 26.983 46.250 30.273 0.506 1.484 16.506 46.250
2024-01-22 37.338 51.321 38.070 0.629 1.249 12.827 52.922
2024-01-23 64.536 75.616 55.692 1.063 4.004 7.494 86.920
2024-01-24 45.481 56.737 40.346 0.773 1.411 12.047 63.101
2024-01-25 36.227 45.540 40.347 0.822 1.926 6.843 51.534
2024-01-26 56.105 67.965 46.040 0.919 3.578 39.221 76.381
2024-01-27 106.544 128.567 61.658 1.450 2.996 24.674 139.430 轻度污染
2024-01-28 118.371 147.475 57.756 1.412 2.956 48.443 154.816 中度污染
2024-01-29 20.742 29.503 22.414 0.347 0.000 65.227 32.614

通过各项指标计算预测数据的最终的AQI

import matplotlib.pyplot as plt
import numpy as np

# 假设 predictions_df 是包含上述数据的DataFrame
quality_levels = ['优', '良', '轻度污染', '中度污染', '重度污染', '严重污染']

# 在这里定义AQI质量等级的颜色
color_dict = {'优': 'green', 
              '良': 'yellow', 
              '轻度污染': 'orange', 
              '中度污染': 'red', 
              '重度污染': 'purple', 
              '严重污染': 'brown'}

# 计算每个等级的频率
quality_counts = predictions_df['air_quality'].value_counts()

# 创建图形和子图
fig, ax = plt.subplots(figsize=(6, 6))  # 这行是新增的

# 绘制饼图
explode = [0]*len(quality_counts)  # 创建全零的列表,长度与quality_counts相同
explode[quality_counts.index.get_loc('优')] = 0.1  # 只让'良'凸出

ax.pie(quality_counts, 
       labels=quality_counts.index, 
       colors=[color_dict[quality] for quality in quality_counts.index],
       autopct='%1.1f%%',  # 显示百分比
       startangle=140,
       explode=explode,  # 设置凸出部分
       shadow=True)  # 添加阴影

plt.show()

png

import matplotlib.pyplot as plt

# 列表,包括要展示的空气质量参数
pollutants = ['PM2.5', 'PM10', 'CO', 'SO2', 'NO2', 'O3_8h']

# 创建一个新的绘图窗口
plt.figure(figsize=(12, 6))

# 对每个污染物的浓度进行绘图
for i, pollutant in enumerate(pollutants):
    plt.plot(predictions_df.index, predictions_df[pollutant], label=pollutant, linestyle=['-', '--', '-.', ':'][i%4], linewidth=2)

# 添加标签
plt.xlabel('日期')
plt.ylabel('指数')
plt.title('各空气污染物浓度变化')
plt.legend(loc='upper left', bbox_to_anchor=(1, 1))

# 添加网格线
plt.grid(True, which='both', linestyle='--', linewidth=0.5)

# 调整x轴刻度
plt.xticks(rotation=45)

# 显示图形
plt.tight_layout()
plt.show()

png

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np

# 创建绘图窗口
fig, ax = plt.subplots(figsize=(12, 6))

# 绘制AQI的变化
bars = ax.bar(predictions_df.index, predictions_df['calculated_AQI'], 
              color=predictions_df['calculated_AQI'].apply(lambda x: cm.YlGnBu(x/200)),  # 使用YlGnBu调色板,绿色到黄色
              alpha=1,        # 透明度
              edgecolor='w')    # 条形的边缘颜色

# 设置颜色条
norm = plt.Normalize(predictions_df['calculated_AQI'].min(), predictions_df['calculated_AQI'].max())
sm = cm.ScalarMappable(cmap='YlGnBu', norm=norm)
sm.set_array([])
cbar = plt.colorbar(sm, ax=ax)
cbar.set_label('AQI')

# 设置轴标签和标题
ax.set_xlabel('日期')
ax.set_ylabel('AQI')
ax.set_title('AQI指数的变化')

# 显示图形
plt.show()

png