一、Optimization Problems

(1)Linear programming

1. Solving a simple LP with PuLP
```python
import pulp

# 1. Define the problem (sense: LpMinimize / LpMaximize)
MyProbLP = pulp.LpProblem("LPProbDemo1", sense=pulp.LpMaximize)

# 2. Define the decision variables (cat: 'Integer' / 'Binary' / 'Continuous')
x1 = pulp.LpVariable('x1', lowBound=0, upBound=7, cat='Continuous')
x2 = pulp.LpVariable('x2', lowBound=0, upBound=7, cat='Continuous')
x3 = pulp.LpVariable('x3', lowBound=0, upBound=7, cat='Continuous')

# 3. Set the objective function
MyProbLP += 2 * x1 + 3 * x2 - 5 * x3

# 4. Add the constraints
MyProbLP += (2 * x1 - 5 * x2 + x3 >= 10)  # inequality constraint
MyProbLP += (x1 + 3 * x2 + x3 <= 12)      # inequality constraint
MyProbLP += (x1 + x2 + x3 == 7)           # equality constraint

# 5. Solve
MyProbLP.solve()
print("Status:", pulp.LpStatus[MyProbLP.status])  # solver status
for v in MyProbLP.variables():
    print(v.name, "=", v.varValue)                # optimal value of each variable
print("F(x) =", pulp.value(MyProbLP.objective))   # optimal objective value
```
2. Solving with SciPy

    A and Aeq are 2-D arrays; the other inputs are 1-D arrays.

```python
from scipy import optimize
import numpy as np

# linprog minimizes, so negate the objective to maximize 2*x1 + 3*x2 - 5*x3
c = np.array([-2, -3, 5])
A = np.array([[-2, 5, -1], [1, 3, 1]])  # A_ub @ x <= b_ub (the >= constraint is negated)
B = np.array([-10, 12])
Aeq = np.array([[1, 1, 1]])
Beq = np.array([7])

# solve; bounds default to x >= 0
res = optimize.linprog(c, A_ub=A, b_ub=B, A_eq=Aeq, b_eq=Beq)
print(res)
print('max F(x) =', -res.fun)  # undo the sign flip
```

Excerpted from:

线性规划和例题 (linear programming with worked examples)

二、Correlation Analysis

This can actually be done in Excel too (PEARSON takes two ranges):

```
=PEARSON(A2:A33, B2:B33)
```

Generating a heatmap with Python:

```python
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.read_excel('附件1')
# in a notebook, these display the three correlation matrices
df.corr("pearson")
df.corr("spearman")
df.corr("kendall")

plt.rcParams['font.sans-serif'] = ['SimHei']  # Chinese-capable font
ax = sns.heatmap(
    df.corr("pearson"),
    annot=True,
    cmap="coolwarm",
    fmt='.2f'
)
# work around clipped first/last rows in old matplotlib versions
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
plt.savefig('heatmap.png')  # savefig needs a file name
```
```python
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Chinese font setup
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# load the data
data = pd.read_excel('use_for_py.xlsx')
data_analysis = data.head(391)

# correlation matrix
correlation_matrix = data_analysis.corr(method='spearman')

# heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5, fmt=".2f")
plt.title('Spearman Correlation Heatmap')
plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.savefig('Corr_heat.png', dpi=300, bbox_inches='tight')
plt.show()
```

Pearson correlation analysis ($\rho \in [-1, 1]$):

$\rho_{X,Y} = \frac{\operatorname{cov}(X, Y)}{\sigma_X \sigma_Y} = \frac{E\left((X-\mu_X)(Y-\mu_Y)\right)}{\sigma_X \sigma_Y} = \frac{E(XY)-E(X)E(Y)}{\sqrt{E(X^2)-E^2(X)}\,\sqrt{E(Y^2)-E^2(Y)}}$

For a sample of size $N$:

$\rho_{X,Y} = \frac{N \sum XY - \sum X \sum Y}{\sqrt{N \sum X^2 - \left(\sum X\right)^2}\,\sqrt{N \sum Y^2 - \left(\sum Y\right)^2}} = \frac{\sum XY - \frac{\sum X \sum Y}{N}}{\sqrt{\left(\sum X^2 - \frac{\left(\sum X\right)^2}{N}\right)\left(\sum Y^2 - \frac{\left(\sum Y\right)^2}{N}\right)}}$
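
As a quick sanity check of the sample formula (a minimal sketch with made-up data), it can be computed directly and compared with numpy's built-in:

```python
import numpy as np

# hypothetical sample data
x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
y = np.array([2.0, 1.9, 3.6, 3.8, 5.1])

n = len(x)
# sample form of the Pearson formula above
r = (n * np.sum(x * y) - np.sum(x) * np.sum(y)) / (
    np.sqrt(n * np.sum(x**2) - np.sum(x)**2) *
    np.sqrt(n * np.sum(y**2) - np.sum(y)**2))

print(r)                        # manual formula
print(np.corrcoef(x, y)[0, 1])  # numpy's built-in; the two should match
```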

t-test: tests whether two groups of a variable differ significantly.

It rests on two assumptions:

① Normality:

the measured variable should be normally distributed in the population and the sample;

even when this fails, by the central limit theorem the distribution of sample means approaches normality once each group has more than 30 samples.

② Homogeneity of variance:

the variances of the two samples must not differ too much.

Compute the t statistic; if it exceeds the critical value, the difference between the groups is significant.

Beyond the independent two-sample test, also consider the dependent t-test for paired samples and the one-sample t-test.
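
A minimal sketch (assuming two independent samples `a` and `b`): scipy provides Levene's test for the homogeneity-of-variance assumption and the independent two-sample t-test; a p-value below the significance level indicates a real difference.

```python
import numpy as np
from scipy.stats import levene, ttest_ind

rng = np.random.default_rng(0)
a = rng.normal(loc=5.0, scale=1.0, size=40)  # hypothetical group 1
b = rng.normal(loc=5.5, scale=1.1, size=40)  # hypothetical group 2

# check homogeneity of variance first
stat, p_var = levene(a, b)
equal_var = p_var > 0.05

# Welch's correction is applied when the variances differ
t, p = ttest_ind(a, b, equal_var=equal_var)
print('t = %.3f, p = %.4f' % (t, p))
```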

Excerpted from:

相关性分析-简单讲解 (a short introduction to correlation analysis)

T-检验等统计方法速览 (a quick tour of the t-test and related methods)

spsspro相关性分析 (SPSSPRO correlation analysis)

三、Data Processing and Machine Learning

```python
from pandas import read_csv
from pandas.plotting import scatter_matrix
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# load the dataset
filename = 'C:/Users/Lenovo/Desktop/上课/python机器学习课程/5.13/iris.data.csv'
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
dataset = read_csv(filename, names=names)
print('Dimensions: %s rows, %s columns' % dataset.shape)

# peek at the data
print(dataset.head(10))

# descriptive statistics
print(dataset.describe())
# class distribution
print(dataset.groupby('class').size())

# visualization
dataset.plot(kind='box', subplots=True, layout=(2, 2), sharex=False, sharey=False)
pyplot.show()  # box plots

dataset.hist()
pyplot.show()  # histograms

scatter_matrix(dataset)
pyplot.show()  # scatter matrix

# split off a validation set
array = dataset.values
X = array[:, 0:4]
Y = array[:, 4]
validation_size = 0.2
seed = 7
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, Y, test_size=validation_size, random_state=seed, shuffle=True)

# spot-check algorithms
models = {}
models['LR'] = LogisticRegression(max_iter=1000)
models['KNN'] = KNeighborsClassifier()
models['LDA'] = LinearDiscriminantAnalysis()
models['CART'] = DecisionTreeClassifier()
models['SVM'] = SVC()
models['NB'] = GaussianNB()

results = []
for key in models:
    kfold = KFold(n_splits=10, random_state=seed, shuffle=True)
    cv_results = cross_val_score(models[key], X_train, Y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    print('%s: %f (%f)' % (key, cv_results.mean(), cv_results.std()))

# evaluate the best model on the validation set
svm = SVC()
svm.fit(X=X_train, y=Y_train)
predictions = svm.predict(X_validation)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))
```

Data import (three ways)

```python
# CSV traits: comma-separated, header row of field names
from csv import reader
import numpy as np

filename = 'pima_data.csv'
with open(filename, 'rt') as raw_data:
    readers = reader(raw_data, delimiter=',')
    x = list(readers)
data = np.array(x).astype('float')
print(data.shape)

# with pandas
from pandas import read_csv
filename = 'pima_data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
print(data.shape)

# with numpy
from numpy import loadtxt
filename = 'pima_data.csv'
with open(filename, 'rt') as raw_data:
    data = loadtxt(raw_data, delimiter=',')
print(data.shape)
```

Understanding the data

```python
print(data.head(10))
# dimensions
print(data.shape)
# column names and types
print(data.dtypes)
# descriptive statistics (useful in a report)
print(data.describe())
# class distribution, for classification tasks
print(data.groupby('class').size())
# correlation: 'pearson' / 'spearman' / 'kendall'
print(data.corr(method='pearson'))
# skew relative to a Gaussian distribution
print(data.skew())
```

Data visualization

```python
import numpy as np
import matplotlib.pyplot as plt
from pandas import read_csv
from pandas.plotting import scatter_matrix

data.hist()  # histograms
data.plot(kind='density', subplots=True, layout=(3, 3), sharex=False)  # density plots
data.plot(kind='box', subplots=True, layout=(3, 3), sharex=False)      # box plots

# correlation matrix plot
correlations = data.corr()
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)
ticks = np.arange(0, 9, 1)
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(names)
ax.set_yticklabels(names)
plt.show()

# scatter matrix
data = read_csv(filename, names=names)
scatter_matrix(data)
plt.show()
```

Data preprocessing

① Rescaling (MinMaxScaler): maps the data into [0, 1].

Typical consumer: K-nearest neighbors.

```python
from sklearn.preprocessing import MinMaxScaler
from numpy import set_printoptions
from pandas import read_csv

data = read_csv(filename, names=names)  # filename/names as in the import example above
array = data.values
X = array[:, 0:8]
Y = array[:, 8]
transformer = MinMaxScaler(feature_range=(0, 1))
newX = transformer.fit_transform(X)
set_printoptions(precision=3)
print(newX)
```

② Standardization (StandardScaler): rescales to zero mean and unit variance.

Typical consumers: linear regression, logistic regression, discriminant analysis.

```python
from sklearn.preprocessing import StandardScaler

transformer = StandardScaler().fit(X)
newX = transformer.transform(X)
set_printoptions(precision=3)
print(newX)
```

③ Normalization (Normalizer), commonly used: rescales each row (each sample) to unit length.

Typical consumers: neural networks, K-nearest neighbors.

```python
from sklearn.preprocessing import Normalizer

transformer = Normalizer().fit(X)
newX = transformer.transform(X)
set_printoptions(precision=3)
print(newX)
```

④ Binarization (Binarizer): thresholds each value to 0 or 1.

```python
from sklearn.preprocessing import Binarizer

transformer = Binarizer(threshold=0.0).fit(X)
newX = transformer.transform(X)
set_printoptions(precision=3)
print(newX)
```

Feature selection

① Univariate feature selection (SelectKBest)

Theory: the classic chi-squared test measures the dependence of a categorical feature on a categorical target, and that is the score used here.

```python
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# X, Y as defined in the preprocessing examples
test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(X, Y)
set_printoptions(precision=3)
print(fit.scores_)
features = fit.transform(X)
print(features)
```

② Recursive feature elimination (RFE), commonly used

Theory: repeatedly fits a base model and prunes the weakest features.

```python
# using logistic regression as the base model
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(X, Y)
print('Number of selected features:')
print(fit.n_features_)
print('Selection mask:')
print(fit.support_)
print('Feature ranking:')
print(fit.ranking_)
```

③ Principal component analysis

Theory: principal component analysis (PCA) uses linear algebra to transform and compress the data, and is usually described as dimensionality reduction. Besides PCA (an unsupervised method), a common alternative is LDA (linear discriminant analysis, a supervised method), which is itself also a classifier.

Use with care: the merged new features still need an interpretation.

```python
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
fit = pca.fit(X)
print('Explained variance: %s' % fit.explained_variance_ratio_)
print(fit.components_)
```

④ Feature importance

Theory: bagged decision trees, random forests, and extremely randomized trees all expose per-feature importances.

```python
from sklearn.ensemble import ExtraTreesClassifier

model = ExtraTreesClassifier()
fit = model.fit(X, Y)
print(fit.feature_importances_)
```

四、Models

The machine-learning workflow:

1. Define the problem: import libraries and data;
2. Understand the data: descriptive statistics, visualization;
3. Prepare the data: cleaning, feature selection, transformation;
4. Evaluate algorithms: split the data, define evaluation metrics, spot-check and compare algorithms;
5. Improve the model: tuning, ensembles;
6. Finalize: validate and produce the final model.

(1)Evaluating algorithms

① Separate training and test sets

```python
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

test_size = 0.33
seed = 4
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size,
                                                    random_state=seed)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print('Accuracy: %.3f%%' % (result * 100))
```

② K-fold cross-validation

Used to get a more reliable estimate of model accuracy.

K-fold cross-validation splits the original data into K (usually equal) parts. Each part takes one turn as the validation set while the remaining K-1 parts form the training set, giving K models; the mean of the K validation accuracies is the performance estimate under this split. K must be at least 2; in practice one starts from 3, and 2 is only tried when the dataset is very small. K-fold cross-validation effectively guards against both overfitting and underfitting, and the resulting estimate is fairly convincing. Typical values are K = 3, 5, or 10.

```python
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

num_fold = 10
seed = 7
kfold = KFold(n_splits=num_fold, random_state=seed, shuffle=True)
model = LogisticRegression(multi_class='multinomial', max_iter=3000)
result = cross_val_score(model, X, Y, cv=kfold)
print("Accuracy: %.3f%% (%.3f%%)" % (result.mean() * 100, result.std() * 100))
```

③ Leave-one-out cross-validation

Accurate, but computationally expensive.

With N samples, leave-one-out cross-validation runs N folds: each sample takes one turn as the validation set while the other N-1 samples form the training set, giving N models; the mean validation accuracy over the N folds is the performance estimate.

```python
from sklearn.model_selection import LeaveOneOut
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

loocv = LeaveOneOut()
model = LogisticRegression(multi_class='multinomial', max_iter=1100)
result = cross_val_score(model, X, Y, cv=loocv)
print("Accuracy: %.3f%% (%.3f%%)" % (result.mean() * 100, result.std() * 100))
```

④ Repeated random train/test splits (ShuffleSplit)

```python
from sklearn.model_selection import ShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

n_splits = 10
test_size = 0.33
seed = 7
kfold = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=seed)
model = LogisticRegression(multi_class='multinomial', max_iter=1100)
result = cross_val_score(model, X, Y, cv=kfold)
print("Accuracy: %.3f%% (%.3f%%)" % (result.mean() * 100, result.std() * 100))
```

(2)Evaluation metrics

The goal is to find the best-performing subset of algorithms; most of the effort goes into evaluating algorithms and preparing data, aiming for 3-5 algorithms of sufficient accuracy.

① Classification accuracy

Accuracy is the number of correctly classified samples divided by the total number of samples.

```python
result = cross_val_score(model, X, Y, cv=kfold)
print("Accuracy: %.3f%% (%.3f%%)" % (result.mean() * 100, result.std() * 100))
```

② Log loss

Log loss scores the predicted class probabilities rather than the hard labels; scikit-learn reports it negated ('neg_log_loss'), so values closer to 0 are better.

```python
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
model = LogisticRegression(multi_class='multinomial', max_iter=1100)

scoring = 'neg_log_loss'
result = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print('LogLoss: %.3f (%.3f)' % (result.mean(), result.std()))
```

③ ROC and AUC

ROC and AUC are classifier evaluation measures. ROC stands for the Receiver Operating Characteristic curve, also called the sensitivity curve: every point on it reflects the response to the same signal, just under different decision thresholds. The curve is built by sweeping a threshold over a continuous score, computing sensitivity and specificity at each threshold, and plotting sensitivity against (1 - specificity). AUC is the Area Under the ROC Curve; it typically lies between 0.5 and 1.0, and a larger AUC means better discrimination. Points near the top-left corner of the ROC plot correspond to thresholds with both high sensitivity and high specificity.

Recall = TP / (TP + FN); recall is the same quantity as sensitivity.

Specificity = TN / (FP + TN)

```python
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# model and kfold as defined above; 'roc_auc' applies to binary targets
scoring = 'roc_auc'
result = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print('AUC %.3f (%.3f)' % (result.mean(), result.std()))
```
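
To actually draw the ROC curve described above, a minimal sketch (assuming a fitted binary classifier `model` that supports `predict_proba`, and a held-out `X_test`, `Y_test`):

```python
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# predicted probabilities for the positive class
probs = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(Y_test, probs)  # fpr = 1 - specificity, tpr = sensitivity

plt.plot(fpr, tpr, label='AUC = %.3f' % auc(fpr, tpr))
plt.plot([0, 1], [0, 1], linestyle='--')  # chance line
plt.xlabel('1 - Specificity (FPR)')
plt.ylabel('Sensitivity (TPR)')
plt.legend()
plt.show()
```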

④ Confusion matrix

Each column of the confusion matrix represents a predicted class, and the column total is the number of samples predicted as that class (in scikit-learn's convention, rows are the true classes and columns the predicted classes).

```python
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

test_size = 0.33
seed = 4
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
model = LogisticRegression(multi_class='multinomial', max_iter=1100)

model.fit(X_train, Y_train)
predicted = model.predict(X_test)
matrix = confusion_matrix(Y_test, predicted)
classes = ['0', '1']  # assumes a binary target
dataframe = pd.DataFrame(data=matrix, index=classes, columns=classes)
print(dataframe)
```

⑤ Classification report

Precision: P = TP / (TP + FP)

Recall: R = TP / (TP + FN)

F1 score: balances precision and recall; it can be read as a weighted average of the two, with a maximum of 1 and a minimum of 0, larger being better.

```python
from sklearn.metrics import classification_report

predicted = model.predict(X_test)
report = classification_report(Y_test, predicted)
print(report)
```

⑥ Mean absolute error (regression)

```python
scoring = 'neg_mean_absolute_error'
result = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print('MAE: %.3f (%.3f)' % (result.mean(), result.std()))
```

⑦ Coefficient of determination ($R^2$)

The coefficient of determination ($R^2$) measures the share of the variation in the dependent variable that the regression explains through the independent variables. The better the fit, the more of the total variation is attributed to the regressors and the more tightly the observations cluster around the regression line. For example, $R^2 = 0.8$ means the regression explains 80% of the variation in the dependent variable; put differently, holding the independent variables fixed would remove 80% of its variability. Properties of $R^2$: it is a non-negative statistic; its range is $0 \le R^2 \le 1$; and since it is a function of the sample observations, it is itself a random variable, so its statistical reliability should also be tested.

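A minimal sketch of scoring a regression with $R^2$ (assuming X, Y hold a regression dataset and kfold is defined as in the examples above); scikit-learn exposes it under the scoring name 'r2':

```python
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

model = LinearRegression()
scoring = 'r2'
result = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print('R^2: %.3f (%.3f)' % (result.mean(), result.std()))
```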
(3)Algorithms

Classification algorithms:

● Linear algorithms

Logistic regression `LogisticRegression(multi_class='multinomial', max_iter=3000)`

Linear discriminant analysis `LinearDiscriminantAnalysis`

● Nonlinear algorithms

K-nearest neighbors `KNeighborsClassifier`

Naive Bayes `GaussianNB`

Classification and regression trees `DecisionTreeClassifier`

Support vector machines `SVC`

Regression algorithms:

● Linear regression `LinearRegression`

● Ridge regression `Ridge` (see the sketch below)


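As a minimal sketch (assuming X and Y hold a regression dataset loaded as in the earlier examples), the two linear regressors can be spot-checked side by side:

```python
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import KFold, cross_val_score

kfold = KFold(n_splits=10, random_state=7, shuffle=True)
for name, model in [('LinearRegression', LinearRegression()),
                    ('Ridge', Ridge(alpha=1.0))]:
    result = cross_val_score(model, X, Y, cv=kfold, scoring='neg_mean_squared_error')
    print('%s: %f (%f)' % (name, result.mean(), result.std()))
```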

(4)Optimization (ensemble methods)

① Bagging

Split the training set into several subsets and train one model per subset; bagging then improves classification accuracy by combining the sub-models' votes into the final answer.

**1. Bagged Decision Trees**

Bagging is especially effective when the data has high variance; the most common instance is bagging decision trees.

```python
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold, cross_val_score

num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)

cart = DecisionTreeClassifier()
num_tree = 100
# note: base_estimator was renamed to estimator in scikit-learn 1.2
model = BaggingClassifier(base_estimator=cart, n_estimators=num_tree, random_state=seed)

result = cross_val_score(model, X, Y, cv=kfold)
print(result.mean())
```

2. Random Forest

One way to think about it: each decision tree is an expert in one narrow area, so a random forest holds many experts from different areas; a new problem (a new input) is examined from all those angles, and the experts vote on the final answer.

```python
from sklearn.ensemble import RandomForestClassifier

num_tree = 100
max_features = 3
model = RandomForestClassifier(n_estimators=num_tree, random_state=seed, max_features=max_features)
# evaluate with cross_val_score as above
```

3. Extra Trees

While random forests follow the bagging recipe, extremely randomized trees fit every tree on the same, complete training set.

```python
from sklearn.ensemble import ExtraTreesClassifier

num_tree = 100
max_features = 7
model = ExtraTreesClassifier(n_estimators=num_tree, random_state=seed, max_features=max_features)
```

② Boosting

Boosting raises the accuracy of weak classifiers by building a sequence of predictors and then combining them into a single predictor.

1. AdaBoost

An iterative algorithm: train different weak classifiers on the same training set, then combine them into a stronger final classifier. It works by reweighting the data: after each round, every sample's weight is updated based on whether it was classified correctly and on the overall accuracy of the round, and the reweighted data is passed to the next classifier. The classifiers from all rounds are finally fused into the decision classifier. AdaBoost can also de-emphasize irrelevant training features and concentrate on the informative ones.

```python
from sklearn.ensemble import AdaBoostClassifier

num_tree = 30
model = AdaBoostClassifier(n_estimators=num_tree, random_state=seed)
```

2. Stochastic Gradient Boosting

To maximize a function, the best direction to explore is its gradient, which always points where the function grows fastest. Because gradient boosting must sweep the whole dataset on every update, its cost is high; stochastic gradient boosting improves on this by updating with one sample at a time, which greatly reduces the computational complexity.

```python
from sklearn.ensemble import GradientBoostingClassifier

num_tree = 100
model = GradientBoostingClassifier(n_estimators=num_tree, random_state=seed)
```

③ Voting

A very simple way to ensemble several learners: build two or more models, wrap them with a voting classifier, and combine the sub-models' predictions. In practice each sub-model's prediction can also be weighted to raise accuracy.

```python
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

models = []
model_logistic = LogisticRegression()
models.append(('logistic', model_logistic))
model_cart = DecisionTreeClassifier()
models.append(('cart', model_cart))
model_svc = SVC()
models.append(('svm', model_svc))
ensemble_model = VotingClassifier(estimators=models)
result = cross_val_score(ensemble_model, X, Y, cv=kfold)
print(result.mean())
```

(5)Hyperparameter tuning

Goal: improve stability, reduce bias and variance.

Parameters fall into two groups: those that drive accuracy and those that prevent overfitting.

① Grid search

Evaluates the algorithm over an explicit list of parameter values to find the best combination, but it only ever searches inside the grid you define. Generally used when there are fewer than 3 parameters.

```python
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# array as loaded in the earlier examples
X = array[:, 0:8]
Y = array[:, 8]
model = Ridge()
param_grid = {'alpha': [1, 0.1, 0.01, 0.001, 0]}  # values to sweep
grid = GridSearchCV(estimator=model, param_grid=param_grid)
grid.fit(X, Y)
# results
print('Best score: %.3f' % grid.best_score_)
print('Best parameter: %s' % grid.best_estimator_.alpha)
```

② Randomized search

Searches the parameter space by sampling from a distribution for a fixed number of iterations.

Compared with grid search, randomized search is a more efficient way to tune (especially with many parameters): each parameter gets a distribution, and candidates are sampled from that space.

Generally used when there are more than 3 parameters.

```python
from sklearn.linear_model import Ridge
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

model = Ridge()
param_grid = {'alpha': uniform()}  # distribution to sample from
grid = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=100, random_state=7)
grid.fit(X, Y)
print('Best score: %.3f' % grid.best_score_)
print('Best parameter: %s' % grid.best_estimator_.alpha)
```

(6)A worked example (Boston housing regression)

```python
# imports
import numpy as np
from matplotlib import pyplot
from pandas import read_csv
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error

# load the data
filename = 'housing.csv'
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
         'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
data = read_csv(filename, names=names, delim_whitespace=True)
print(data.shape)
print(data.dtypes)
print(data.head(30))
print(data.describe())
print(data.corr(method='pearson'))

# visualization
data.hist(sharex=False, sharey=False, xlabelsize=1, ylabelsize=1)
pyplot.show()
data.plot(kind='density', subplots=True, layout=(4, 4), sharex=False, fontsize=1)
pyplot.show()
data.plot(kind='box', subplots=True, layout=(4, 4),
          sharex=False, sharey=False, fontsize=8)
pyplot.show()

# scatter matrix
scatter_matrix(data)
pyplot.show()
# correlation matrix plot
fig = pyplot.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(data.corr(), vmin=-1, vmax=1, interpolation='none')
fig.colorbar(cax)
ticks = np.arange(0, 14, 1)
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(names)
ax.set_yticklabels(names)
pyplot.show()

# split off a validation set
array = data.values
X = array[:, 0:13]
Y = array[:, 13]
validation_size = 0.2
seed = 7
num_folds = 10
scoring = 'neg_mean_squared_error'
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, Y, test_size=validation_size, random_state=seed)

# spot-check algorithms
models = {}
models['LR'] = LinearRegression()
models['LASSO'] = Lasso()
models['EN'] = ElasticNet()
models['KNN'] = KNeighborsRegressor()
models['CART'] = DecisionTreeRegressor()
models['SVM'] = SVR()
results = []
for key in models:
    kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
    cv_result = cross_val_score(models[key], X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_result)
    print('%s: %f (%f)' % (key, cv_result.mean(), cv_result.std()))

fig = pyplot.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
pyplot.boxplot(results)
ax.set_xticklabels(models.keys())
pyplot.show()

# evaluate again on standardized data
pipelines = {}
pipelines['ScalerLR'] = Pipeline([('Scaler', StandardScaler()), ('LR', LinearRegression())])
pipelines['ScalerLASSO'] = Pipeline([('Scaler', StandardScaler()), ('LASSO', Lasso())])
pipelines['ScalerEN'] = Pipeline([('Scaler', StandardScaler()), ('EN', ElasticNet())])
pipelines['ScalerKNN'] = Pipeline([('Scaler', StandardScaler()), ('KNN', KNeighborsRegressor())])
pipelines['ScalerCART'] = Pipeline([('Scaler', StandardScaler()), ('CART', DecisionTreeRegressor())])
pipelines['ScalerSVM'] = Pipeline([('Scaler', StandardScaler()), ('SVM', SVR())])
results = []
for key in pipelines:
    kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
    cv_result = cross_val_score(pipelines[key], X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_result)
    print('%s: %f (%f)' % (key, cv_result.mean(), cv_result.std()))
# box plot of the comparison
fig = pyplot.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
pyplot.boxplot(results)
ax.set_xticklabels(pipelines.keys())
pyplot.show()

# tune KNN on standardized data
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
param_grid = {'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21]}
model = KNeighborsRegressor()
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=rescaledX, y=Y_train)
print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))
cv_results = zip(grid_result.cv_results_['mean_test_score'],
                 grid_result.cv_results_['std_test_score'],
                 grid_result.cv_results_['params'])
for mean, std, param in cv_results:
    print('%f (%f) with %r' % (mean, std, param))

# ensembles
ensembles = {}
ensembles['ScaledAB'] = Pipeline([('Scaler', StandardScaler()), ('AB', AdaBoostRegressor())])
ensembles['ScaledAB-KNN'] = Pipeline([('Scaler', StandardScaler()),
                                      ('ABKNN', AdaBoostRegressor(base_estimator=KNeighborsRegressor(n_neighbors=1)))])
ensembles['ScaledAB-LR'] = Pipeline([('Scaler', StandardScaler()),
                                     ('ABLR', AdaBoostRegressor(LinearRegression()))])
ensembles['ScaledRFR'] = Pipeline([('Scaler', StandardScaler()),
                                   ('RFR', RandomForestRegressor())])
ensembles['ScaledETR'] = Pipeline([('Scaler', StandardScaler()),
                                   ('ETR', ExtraTreesRegressor())])
ensembles['ScaledGBR'] = Pipeline([('Scaler', StandardScaler()),
                                   ('GBR', GradientBoostingRegressor())])
results = []
for key in ensembles:
    kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
    cv_result = cross_val_score(ensembles[key], X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_result)
    print('%s: %f (%f)' % (key, cv_result.mean(), cv_result.std()))

# box plot of the ensembles
fig = pyplot.figure()
fig.suptitle('Ensemble Comparison')
ax = fig.add_subplot(111)
pyplot.boxplot(results)
ax.set_xticklabels(ensembles.keys())
pyplot.show()

# tune n_estimators for gradient boosting
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
param_grid = {'n_estimators': [10, 50, 100, 200, 300, 400, 500, 600,
                               700, 800, 900, 950]}
model = GradientBoostingRegressor()
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=rescaledX, y=Y_train)
print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))

# tune n_estimators for extra trees
model = ExtraTreesRegressor()
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=rescaledX, y=Y_train)
print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))

# finalize the model
final_model = ExtraTreesRegressor(n_estimators=600)
final_model.fit(X=rescaledX, y=Y_train)
# evaluate on the validation set
rescaledX_validation = scaler.transform(X_validation)
predictions = final_model.predict(rescaledX_validation)
print(mean_squared_error(Y_validation, predictions))
```

References:

Python数学建模与分析:基础入门、数据处理、算法编程、高级绘图、建模实战! (bilibili)

五、Plotting

(1)Box plots

```python
import pandas as pd
import matplotlib.pyplot as plt

data = pd.read_excel('time.xlsx', index_col='日期')

data.boxplot()  # simplest form

# horizontal, with red-star fliers, filled boxes, and mean markers
data.boxplot(sym='r*', vert=False, patch_artist=True, meanline=False, showmeans=True)

plt.boxplot(x=data,             # data to plot
            whis=1.5,           # whiskers at 1.5 * IQR
            widths=0.7,         # box width 0.7
            patch_artist=True,  # fill the boxes with color
            showmeans=True,     # show the mean
            boxprops={'facecolor': 'steelblue'},  # box fill color
            # flier (outlier) fill color, edge color and size
            flierprops={'markerfacecolor': 'red', 'markeredgecolor': 'red', 'markersize': 4},
            # mean marker (diamond), fill color and size
            meanprops={'marker': 'D', 'markerfacecolor': 'black', 'markersize': 4},
            medianprops={'linestyle': '--', 'color': 'orange'},  # median line style and color
            labels=['']         # drop the x-axis tick label
            )
# show the figure
plt.show()
```
```python
# remove outliers
# lower and upper quartiles
Q1 = data['销量'].quantile(q=0.25)
Q3 = data['销量'].quantile(q=0.75)

# whisker limits at 1.5 * IQR
low_whisker = Q1 - 1.5 * (Q3 - Q1)
up_whisker = Q3 + 1.5 * (Q3 - Q1)

# find the outliers
data['销量'][(data['销量'] > up_whisker) | (data['销量'] < low_whisker)]
```

(2)Normal-distribution reference plot

```python
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl

# load the data
pay_ratio = pd.read_excel(r'time.xlsx')

plt.figure(figsize=(35, 6))
# line plot with point markers on top
plt.plot(pay_ratio['日期'], pay_ratio['销量'],
         linestyle='-', linewidth=2, color='steelblue',
         marker='o', markersize=4, markeredgecolor='black', markerfacecolor='black')

# horizontal reference lines at mean ± 2 standard deviations
plt.axhline(y=pay_ratio['销量'].mean() - 2 * pay_ratio['销量'].std(), linestyle='--', color='gray')
plt.axhline(y=pay_ratio['销量'].mean() + 2 * pay_ratio['销量'].std(), linestyle='--', color='gray')

# date display format
date_format = mpl.dates.DateFormatter("%m-%d")
ax = plt.gca()
ax.xaxis.set_major_formatter(date_format)

# days between x-axis ticks
xlocator = mpl.ticker.MultipleLocator(7)
ax.xaxis.set_major_locator(xlocator)

# rotate the tick labels 45 degrees so they don't crowd
plt.xticks(rotation=45)

# show the figure
plt.show()
```

Screening outliers:

```python
# thresholds for outliers and extreme outliers
outlier_ll = pay_ratio['销量'].mean() - 2 * pay_ratio['销量'].std()
outlier_ul = pay_ratio['销量'].mean() + 2 * pay_ratio['销量'].std()

extreme_outlier_ll = pay_ratio['销量'].mean() - 3 * pay_ratio['销量'].std()
extreme_outlier_ul = pay_ratio['销量'].mean() + 3 * pay_ratio['销量'].std()

# find the outliers
pay_ratio.loc[(pay_ratio['销量'] > outlier_ul) | (pay_ratio['销量'] < outlier_ll), ['日期', '销量']]

# find the extreme outliers
pay_ratio.loc[(pay_ratio['销量'] > extreme_outlier_ul) | (pay_ratio['销量'] < extreme_outlier_ll), ['日期', '销量']]
```

六、Cluster Analysis (Unsupervised)

(1)k-means

If K is unknown, use the elbow method: assume at most 9 clusters, compute the mean dispersion for each K from 1 to 9, and pick the K where the drop in dispersion flattens out most sharply.

The input is a plain array X, e.g.:

```
[[1.4889993  4.18741329]
 [3.95221785 3.76674812]
 [4.09826192 3.95063903]
 [3.65208848 4.44383585]]
```
```python
import numpy as np
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt
import pandas as pd

data = pd.read_excel('data.xlsx', header=0).iloc[:501, 3:5]
X = np.array(data)

# elbow method: mean distance of each point to its nearest centroid
K = range(1, 10)
meanDispersions = []
for k in K:
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(X)
    meanDispersions.append(
        sum(np.min(cdist(X, kmeans.cluster_centers_, 'euclidean'), axis=1)) / X.shape[0])

plt.rcParams['font.family'] = ['sans-serif']
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.plot(K, meanDispersions, 'bx-')
plt.xlabel('k')
plt.ylabel('mean dispersion')
plt.title('Choosing K with the elbow method')
plt.show()

# the actual clustering
kmeans = KMeans(n_clusters=3)
result = kmeans.fit_predict(X)
print(result)
x = [i[0] for i in X]
y = [i[1] for i in X]
plt.scatter(x, y, c=result, marker='o')
plt.xlabel('x')
plt.ylabel('y')
plt.title('title')
plt.show()

print('cluster labels', kmeans.labels_)
print('cluster centers', kmeans.cluster_centers_)
print('within-cluster sum of squares', kmeans.inertia_)
```

Cluster validity measures: the Rand index, the silhouette coefficient, and the Calinski-Harabasz index.

The silhouette coefficient lies in [-1, 1]; larger values mean samples are close to their own cluster and far from other clusters, i.e. better clustering.

```python
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples
import numpy as np

y = []
for n in range(2, 23):
    kmeans = KMeans(n_clusters=n)
    kmeans.fit(data)
    label = kmeans.labels_
    sil_values = silhouette_samples(data, label, metric='euclidean')
    y.append(np.mean(sil_values))
print(y)
```
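
The Calinski-Harabasz index mentioned above is also available in scikit-learn; a minimal sketch (assuming `data` is the array used above), where higher values indicate better-separated clusters:

```python
from sklearn.cluster import KMeans
from sklearn.metrics import calinski_harabasz_score

for n in range(2, 10):
    labels = KMeans(n_clusters=n).fit_predict(data)
    print(n, calinski_harabasz_score(data, labels))
```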

References:

用python实现聚类分析 (米法 的博客, CSDN)

十种常用聚类算法(python完整代码演示) (小刘研CV 的博客, CSDN)

七、Evaluation Models

(1)TOPSIS

```python
import numpy as np
import xlrd  # note: xlrd >= 2.0 reads only .xls; use openpyxl/pandas for .xlsx
import pandas as pd

# read the data
def read(file):
    wb = xlrd.open_workbook(filename=file)  # open the workbook
    sheet = wb.sheet_by_index(0)             # first sheet
    rows = sheet.nrows                       # number of rows
    all_content = []                         # collected data
    for j in range(1, 5):                    # columns 1..4
        temp = []
        for i in range(1, rows):
            cell = sheet.cell_value(i, j)    # read one cell
            temp.append(cell)
        all_content.append(temp)             # append column by column
    return np.array(all_content)

# min-type indicator -> max-type indicator
def dataDirection_1(datas):
    return np.max(datas) - datas

# mid-type indicator -> max-type indicator
def dataDirection_2(datas, x_best):
    temp_datas = datas - x_best
    M = np.max(abs(temp_datas))
    answer_datas = 1 - abs(datas - x_best) / M
    return answer_datas

# interval-type indicator -> max-type indicator
def dataDirection_3(datas, x_min, x_max):
    M = max(x_min - np.min(datas), np.max(datas) - x_max)
    answer_list = []
    for i in datas:
        if i < x_min:
            answer_list.append(1 - (x_min - i) / M)
        elif x_min <= i <= x_max:
            answer_list.append(1)
        else:
            answer_list.append(1 - (i - x_max) / M)
    return np.array(answer_list)

# normalize the direction-corrected matrix
def temp2(datas):
    K = np.power(np.sum(pow(datas, 2), axis=1), 0.5)
    for i in range(0, K.size):
        for j in range(0, datas[i].size):
            datas[i, j] = datas[i, j] / K[i]  # standard matrix normalization
    return datas

# compute the scores and normalize them
def temp3(answer2):
    list_max = np.max(answer2, axis=1)  # per-indicator maxima
    list_min = np.min(answer2, axis=1)  # per-indicator minima
    max_list = []     # distance of object i to the ideal best
    min_list = []     # distance of object i to the ideal worst
    answer_list = []  # unnormalized scores
    for k in range(0, np.size(answer2, axis=1)):  # each evaluated object
        max_sum = 0
        min_sum = 0
        for q in range(0, 4):  # four indicators
            max_sum += np.power(answer2[q, k] - list_max[q], 2)  # D_i^+
            min_sum += np.power(answer2[q, k] - list_min[q], 2)  # D_i^-
        max_list.append(pow(max_sum, 0.5))
        min_list.append(pow(min_sum, 0.5))
        # score: S_i = D_i^- / (D_i^+ + D_i^-)
        answer_list.append(min_list[k] / (min_list[k] + max_list[k]))
    answer = np.array(answer_list)
    return answer / np.sum(answer)  # normalize the scores


def main():
    file = 'C:\\Users\\lenovo\\Desktop\\数学建模\\TOPSIS法\\第2讲.TOPSIS法(优劣解距离法)7.17\\代码和例题数据\\20条河流的水质情况数据.xlsx'
    answer1 = read(file)
    answer2 = []
    for i in range(0, 4):  # convert each of the four columns to a max-type indicator
        if i == 0:      # already max-type, no conversion needed
            answer = answer1[0]
        elif i == 1:    # mid-type, best value 7
            answer = dataDirection_2(answer1[1], 7)
        elif i == 2:    # min-type
            answer = dataDirection_1(answer1[2])
        else:           # interval-type, ideal range [10, 20]
            answer = dataDirection_3(answer1[3], 10, 20)
        answer2.append(answer)
    answer2 = np.array(answer2)  # list -> numpy array
    answer3 = temp2(answer2)     # normalize the matrix
    answer4 = temp3(answer3)     # compute the normalized scores
    data = pd.DataFrame(answer4)

    # write the scores to an Excel file
    writer = pd.ExcelWriter('A.xlsx')
    data.to_excel(writer, sheet_name='page_1', float_format='%.5f')  # 'page_1' is the sheet name
    writer.close()  # closing also saves

main()
```
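
To see what the three direction-conversion functions do, here is a toy run on hypothetical indicator values (assuming the functions above are in scope):

```python
import numpy as np

datas = np.array([2.0, 5.0, 8.0, 11.0])  # hypothetical indicator values
print(dataDirection_1(datas))            # min-type:                 [9. 6. 3. 0.]
print(dataDirection_2(datas, 7))         # mid-type, best value 7:   [0.  0.6 0.8 0.2]
print(dataDirection_3(datas, 4, 9))      # interval-type on [4, 9]:  [0. 1. 1. 0.]
```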

References:

Python实现TOPSIS分析法(优劣解距离法) (XHHP 的博客, CSDN)

TOPSIS法(优劣解距离法)介绍及 python3 实现 - 知乎 (zhihu.com)