导入数据探索的工具包
# Import the data-exploration toolkit.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

import warnings
warnings.filterwarnings("ignore")  # silence library deprecation noise during EDA

# Jupyter-only magic: renders matplotlib figures inline.
# Uncomment when running inside a notebook; invalid in a plain .py script.
# %matplotlib inline
读取数据文件
使用Pandas库read_csv()函数进行数据读取,分隔符为‘\t’
1 2 3 !wget http://tianchi-media.oss-cn-beijing.aliyuncs.com/DSW/Industrial_Steam_Forecast/zhengqi_test.txt !wget http://tianchi-media.oss-cn-beijing.aliyuncs.com/DSW/Industrial_Steam_Forecast/zhengqi_train.txt
# Paths to the downloaded tab-separated data files.
train_data_file = "./zhengqi_train.txt"
test_data_file = "./zhengqi_test.txt"

# Both files are TSV; pandas parses them into DataFrames.
train_data = pd.read_csv(train_data_file, sep='\t', encoding='utf-8')
test_data = pd.read_csv(test_data_file, sep='\t', encoding='utf-8')
查看训练集特征变量信息
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2888 entries, 0 to 2887
Data columns (total 39 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 V0 2888 non-null float64
1 V1 2888 non-null float64
2 V2 2888 non-null float64
3 V3 2888 non-null float64
4 V4 2888 non-null float64
5 V5 2888 non-null float64
6 V6 2888 non-null float64
7 V7 2888 non-null float64
8 V8 2888 non-null float64
9 V9 2888 non-null float64
10 V10 2888 non-null float64
11 V11 2888 non-null float64
12 V12 2888 non-null float64
13 V13 2888 non-null float64
14 V14 2888 non-null float64
15 V15 2888 non-null float64
16 V16 2888 non-null float64
17 V17 2888 non-null float64
18 V18 2888 non-null float64
19 V19 2888 non-null float64
20 V20 2888 non-null float64
21 V21 2888 non-null float64
22 V22 2888 non-null float64
23 V23 2888 non-null float64
24 V24 2888 non-null float64
25 V25 2888 non-null float64
26 V26 2888 non-null float64
27 V27 2888 non-null float64
28 V28 2888 non-null float64
29 V29 2888 non-null float64
30 V30 2888 non-null float64
31 V31 2888 non-null float64
32 V32 2888 non-null float64
33 V33 2888 non-null float64
34 V34 2888 non-null float64
35 V35 2888 non-null float64
36 V36 2888 non-null float64
37 V37 2888 non-null float64
38 target 2888 non-null float64
dtypes: float64(39)
memory usage: 880.1 KB
此训练集数据共有2888个样本,数据中有V0-V37共计38个特征变量,变量类型都为数值类型,所有数据特征没有缺失值数据; 数据字段由于采用了脱敏处理,删除了特征数据的具体含义;target字段为标签变量
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1925 entries, 0 to 1924
Data columns (total 38 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 V0 1925 non-null float64
1 V1 1925 non-null float64
2 V2 1925 non-null float64
3 V3 1925 non-null float64
4 V4 1925 non-null float64
5 V5 1925 non-null float64
6 V6 1925 non-null float64
7 V7 1925 non-null float64
8 V8 1925 non-null float64
9 V9 1925 non-null float64
10 V10 1925 non-null float64
11 V11 1925 non-null float64
12 V12 1925 non-null float64
13 V13 1925 non-null float64
14 V14 1925 non-null float64
15 V15 1925 non-null float64
16 V16 1925 non-null float64
17 V17 1925 non-null float64
18 V18 1925 non-null float64
19 V19 1925 non-null float64
20 V20 1925 non-null float64
21 V21 1925 non-null float64
22 V22 1925 non-null float64
23 V23 1925 non-null float64
24 V24 1925 non-null float64
25 V25 1925 non-null float64
26 V26 1925 non-null float64
27 V27 1925 non-null float64
28 V28 1925 non-null float64
29 V29 1925 non-null float64
30 V30 1925 non-null float64
31 V31 1925 non-null float64
32 V32 1925 non-null float64
33 V33 1925 non-null float64
34 V34 1925 non-null float64
35 V35 1925 non-null float64
36 V36 1925 non-null float64
37 V37 1925 non-null float64
dtypes: float64(38)
memory usage: 571.6 KB
测试集数据共有1925个样本,数据中有V0-V37共计38个特征变量,变量类型都为数值类型
查看数据统计信息
V0
V1
V2
V3
V4
V5
V6
V7
V8
V9
...
V29
V30
V31
V32
V33
V34
V35
V36
V37
target
count
2888.000000
2888.000000
2888.000000
2888.000000
2888.000000
2888.000000
2888.000000
2888.000000
2888.000000
2888.000000
...
2888.000000
2888.000000
2888.000000
2888.000000
2888.000000
2888.000000
2888.000000
2888.000000
2888.000000
2888.000000
mean
0.123048
0.056068
0.289720
-0.067790
0.012921
-0.558565
0.182892
0.116155
0.177856
-0.169452
...
0.097648
0.055477
0.127791
0.020806
0.007801
0.006715
0.197764
0.030658
-0.130330
0.126353
std
0.928031
0.941515
0.911236
0.970298
0.888377
0.517957
0.918054
0.955116
0.895444
0.953813
...
1.061200
0.901934
0.873028
0.902584
1.006995
1.003291
0.985675
0.970812
1.017196
0.983966
min
-4.335000
-5.122000
-3.420000
-3.956000
-4.742000
-2.182000
-4.576000
-5.048000
-4.692000
-12.891000
...
-2.912000
-4.507000
-5.859000
-4.053000
-4.627000
-4.789000
-5.695000
-2.608000
-3.630000
-3.044000
25%
-0.297000
-0.226250
-0.313000
-0.652250
-0.385000
-0.853000
-0.310000
-0.295000
-0.159000
-0.390000
...
-0.664000
-0.283000
-0.170250
-0.407250
-0.499000
-0.290000
-0.202500
-0.413000
-0.798250
-0.350250
50%
0.359000
0.272500
0.386000
-0.044500
0.110000
-0.466000
0.388000
0.344000
0.362000
0.042000
...
-0.023000
0.053500
0.299500
0.039000
-0.040000
0.160000
0.364000
0.137000
-0.185500
0.313000
75%
0.726000
0.599000
0.918250
0.624000
0.550250
-0.154000
0.831250
0.782250
0.726000
0.042000
...
0.745250
0.488000
0.635000
0.557000
0.462000
0.273000
0.602000
0.644250
0.495250
0.793250
max
2.121000
1.918000
2.828000
2.457000
2.689000
0.489000
1.895000
1.918000
2.245000
1.335000
...
4.580000
2.689000
2.013000
2.395000
5.465000
5.110000
2.324000
5.238000
3.000000
2.538000
8 rows × 39 columns
V0
V1
V2
V3
V4
V5
V6
V7
V8
V9
...
V28
V29
V30
V31
V32
V33
V34
V35
V36
V37
count
1925.000000
1925.000000
1925.000000
1925.000000
1925.000000
1925.000000
1925.000000
1925.000000
1925.000000
1925.000000
...
1925.000000
1925.000000
1925.000000
1925.000000
1925.000000
1925.000000
1925.000000
1925.000000
1925.000000
1925.000000
mean
-0.184404
-0.083912
-0.434762
0.101671
-0.019172
0.838049
-0.274092
-0.173971
-0.266709
0.255114
...
-0.206871
-0.146463
-0.083215
-0.191729
-0.030782
-0.011433
-0.009985
-0.296895
-0.046270
0.195735
std
1.073333
1.076670
0.969541
1.034925
1.147286
0.963043
1.054119
1.040101
1.085916
1.014394
...
1.064140
0.880593
1.126414
1.138454
1.130228
0.989732
0.995213
0.946896
1.040854
0.940599
min
-4.814000
-5.488000
-4.283000
-3.276000
-4.921000
-1.168000
-5.649000
-5.625000
-6.059000
-6.784000
...
-2.435000
-2.413000
-4.507000
-7.698000
-4.057000
-4.627000
-4.789000
-7.477000
-2.608000
-3.346000
25%
-0.664000
-0.451000
-0.978000
-0.644000
-0.497000
0.122000
-0.732000
-0.509000
-0.775000
-0.390000
...
-0.453000
-0.818000
-0.339000
-0.476000
-0.472000
-0.460000
-0.290000
-0.349000
-0.593000
-0.432000
50%
0.065000
0.195000
-0.267000
0.220000
0.118000
0.437000
-0.082000
0.018000
-0.004000
0.401000
...
-0.445000
-0.199000
0.010000
0.100000
0.155000
-0.040000
0.160000
-0.270000
0.083000
0.152000
75%
0.549000
0.589000
0.278000
0.793000
0.610000
1.928000
0.457000
0.515000
0.482000
0.904000
...
-0.434000
0.468000
0.447000
0.471000
0.627000
0.419000
0.273000
0.364000
0.651000
0.797000
max
2.100000
2.120000
1.946000
2.603000
4.475000
3.176000
1.528000
1.394000
2.408000
1.766000
...
4.656000
3.022000
3.139000
1.428000
2.299000
5.465000
5.110000
1.671000
2.861000
3.021000
8 rows × 38 columns
上面数据显示了数据的统计信息,例如样本数,数据的均值mean,标准差std,最小值,最大值等
查看数据字段信息
V0
V1
V2
V3
V4
V5
V6
V7
V8
V9
...
V29
V30
V31
V32
V33
V34
V35
V36
V37
target
0
0.566
0.016
-0.143
0.407
0.452
-0.901
-1.812
-2.360
-0.436
-2.114
...
0.136
0.109
-0.615
0.327
-4.627
-4.789
-5.101
-2.608
-3.508
0.175
1
0.968
0.437
0.066
0.566
0.194
-0.893
-1.566
-2.360
0.332
-2.114
...
-0.128
0.124
0.032
0.600
-0.843
0.160
0.364
-0.335
-0.730
0.676
2
1.013
0.568
0.235
0.370
0.112
-0.797
-1.367
-2.360
0.396
-2.114
...
-0.009
0.361
0.277
-0.116
-0.843
0.160
0.364
0.765
-0.589
0.633
3
0.733
0.368
0.283
0.165
0.599
-0.679
-1.200
-2.086
0.403
-2.114
...
0.015
0.417
0.279
0.603
-0.843
-0.065
0.364
0.333
-0.112
0.206
4
0.684
0.638
0.260
0.209
0.337
-0.454
-1.073
-2.086
0.314
-2.114
...
0.183
1.078
0.328
0.418
-0.843
-0.215
0.364
-0.280
-0.028
0.384
5 rows × 39 columns
上面显示训练集前5条数据的基本信息,可以看到数据都是浮点型数据,数据都是数值型连续型特征
V0
V1
V2
V3
V4
V5
V6
V7
V8
V9
...
V28
V29
V30
V31
V32
V33
V34
V35
V36
V37
0
0.368
0.380
-0.225
-0.049
0.379
0.092
0.550
0.551
0.244
0.904
...
-0.449
0.047
0.057
-0.042
0.847
0.534
-0.009
-0.190
-0.567
0.388
1
0.148
0.489
-0.247
-0.049
0.122
-0.201
0.487
0.493
-0.127
0.904
...
-0.443
0.047
0.560
0.176
0.551
0.046
-0.220
0.008
-0.294
0.104
2
-0.166
-0.062
-0.311
0.046
-0.055
0.063
0.485
0.493
-0.227
0.904
...
-0.458
-0.398
0.101
0.199
0.634
0.017
-0.234
0.008
0.373
0.569
3
0.102
0.294
-0.259
0.051
-0.183
0.148
0.474
0.504
0.010
0.904
...
-0.456
-0.398
1.007
0.137
1.042
-0.040
-0.290
0.008
-0.666
0.391
4
0.300
0.428
0.208
0.051
-0.033
0.116
0.408
0.497
0.155
0.904
...
-0.458
-0.776
0.291
0.370
0.181
-0.040
-0.290
0.008
-0.140
-0.497
5 rows × 38 columns
画箱形图探索数据
# Vertical box plot of feature V0 to eyeball its spread and outliers.
fig = plt.figure(figsize=(4, 6))
sns.boxplot(train_data['V0'], orient="v", width=0.5)
<AxesSubplot:xlabel='V0'>
# Box plots for all 38 feature columns (V0..V37) in a 13x3 grid.
column = train_data.columns.tolist()[:39]
fig = plt.figure(figsize=(20, 40))
for i in range(38):
    plt.subplot(13, 3, i + 1)
    sns.boxplot(train_data[column[i]], orient="v", width=0.5)
    plt.ylabel(column[i], fontsize=8)
plt.show()
查看数据分布图
查看特征变量‘V0’的数据分布直方图,并绘制Q-Q图查看数据是否近似于正态分布
# Histogram (with fitted normal) and Q-Q plot for V0 to check normality.
# NOTE: sns.distplot is deprecated in seaborn >= 0.11; works on older versions.
plt.figure(figsize=(10, 5))
ax = plt.subplot(1, 2, 1)
sns.distplot(train_data['V0'], fit=stats.norm)
ax = plt.subplot(1, 2, 2)
res = stats.probplot(train_data['V0'], plot=plt)
查看所有数据的直方图和Q-Q图,查看训练集的数据是否近似于正态分布
# Histogram + Q-Q plot for every column of the training set (two subplots
# per column, six subplots per grid row).
train_cols = 6
train_rows = len(train_data.columns)
plt.figure(figsize=(4 * train_cols, 4 * train_rows))

i = 0
for col in train_data.columns:
    i += 1
    ax = plt.subplot(train_rows, train_cols, i)
    sns.distplot(train_data[col], fit=stats.norm)

    i += 1
    ax = plt.subplot(train_rows, train_cols, i)
    res = stats.probplot(train_data[col], plot=plt)
plt.show()
由上面的数据分布图信息可以看出,很多特征变量(如’V1’,‘V9’,‘V24’,'V28’等)的数据分布不是正态的,数据并不跟随对角线,后续可以使用数据变换对数据进行转换。
对比同一特征变量‘V0’下,训练集数据和测试集数据的分布情况,查看数据分布是否一致
# Overlay train (red) vs test (blue) KDE of V0 to compare distributions.
ax = sns.kdeplot(train_data['V0'], color="Red", shade=True)
ax = sns.kdeplot(test_data['V0'], color="Blue", shade=True)
ax.set_xlabel('V0')
ax.set_ylabel("Frequency")
ax = ax.legend(["train", "test"])
查看所有特征变量下,训练集数据和测试集数据的分布情况,分析并寻找出数据分布不一致的特征变量。
# Train-vs-test KDE overlay for every feature, to spot covariate shift.
dist_cols = 6
dist_rows = len(test_data.columns)
plt.figure(figsize=(4 * dist_cols, 4 * dist_rows))

i = 1
for col in test_data.columns:
    ax = plt.subplot(dist_rows, dist_cols, i)
    ax = sns.kdeplot(train_data[col], color="Red", shade=True)
    ax = sns.kdeplot(test_data[col], color="Blue", shade=True)
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")
    ax = ax.legend(["train", "test"])
    i += 1
plt.show()
查看特征’V5’, ‘V17’, ‘V28’, ‘V22’, ‘V11’, 'V9’数据的数据分布
# Zoom in on the six features whose train/test distributions look inconsistent.
drop_col = 6
drop_row = 1
plt.figure(figsize=(5 * drop_col, 5 * drop_row))

i = 1
for col in ["V5", "V9", "V11", "V17", "V22", "V28"]:
    ax = plt.subplot(drop_row, drop_col, i)
    ax = sns.kdeplot(train_data[col], color="Red", shade=True)
    ax = sns.kdeplot(test_data[col], color="Blue", shade=True)
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")
    ax = ax.legend(["train", "test"])
    i += 1
plt.show()
由上图的数据分布可以看到特征’V5’,‘V9’,‘V11’,‘V17’,‘V22’,‘V28’ 训练集数据与测试集数据分布不一致,会导致模型泛化能力差,采用删除此类特征方法。
# Features whose train/test distributions disagree; dropped to help
# generalization.
drop_columns = ['V5', 'V9', 'V11', 'V17', 'V22', 'V28']
可视化线性回归关系
查看特征变量‘V0’与’target’变量的线性回归关系
# Linear-regression scatter of V0 vs target, plus V0's distribution.
fcols = 2
frows = 1
plt.figure(figsize=(8, 4))

ax = plt.subplot(1, 2, 1)
sns.regplot(x='V0', y='target', data=train_data, ax=ax,
            scatter_kws={'marker': '.', 's': 3, 'alpha': 0.3},
            line_kws={'color': 'k'})
plt.xlabel('V0')
plt.ylabel('target')

ax = plt.subplot(1, 2, 2)
sns.distplot(train_data['V0'].dropna())
plt.xlabel('V0')
plt.show()
查看所有特征变量与’target’变量的线性回归关系
# Regression scatter vs target and distribution plot for every feature
# (iterates test_data.columns so 'target' itself is excluded).
fcols = 6
frows = len(test_data.columns)
plt.figure(figsize=(5 * fcols, 4 * frows))

i = 0
for col in test_data.columns:
    i += 1
    ax = plt.subplot(frows, fcols, i)
    sns.regplot(x=col, y='target', data=train_data, ax=ax,
                scatter_kws={'marker': '.', 's': 3, 'alpha': 0.3},
                line_kws={'color': 'k'})
    plt.xlabel(col)
    plt.ylabel('target')

    i += 1
    ax = plt.subplot(frows, fcols, i)
    sns.distplot(train_data[col].dropna())
    plt.xlabel(col)
查看特征变量的相关性
# Pearson correlation matrix of the remaining features + target.
# Reuse the drop_columns list defined above instead of repeating the literal.
data_train1 = train_data.drop(drop_columns, axis=1)
train_corr = data_train1.corr()
train_corr  # displayed by the notebook
V0
V1
V2
V3
V4
V6
V7
V8
V10
V12
...
V29
V30
V31
V32
V33
V34
V35
V36
V37
target
V0
1.000000
0.908607
0.463643
0.409576
0.781212
0.189267
0.141294
0.794013
0.298443
0.751830
...
0.302145
0.156968
0.675003
0.050951
0.056439
-0.019342
0.138933
0.231417
-0.494076
0.873212
V1
0.908607
1.000000
0.506514
0.383924
0.657790
0.276805
0.205023
0.874650
0.310120
0.656186
...
0.147096
0.175997
0.769745
0.085604
0.035129
-0.029115
0.146329
0.235299
-0.494043
0.871846
V2
0.463643
0.506514
1.000000
0.410148
0.057697
0.615938
0.477114
0.703431
0.346006
0.059941
...
-0.275764
0.175943
0.653764
0.033942
0.050309
-0.025620
0.043648
0.316462
-0.734956
0.638878
V3
0.409576
0.383924
0.410148
1.000000
0.315046
0.233896
0.197836
0.411946
0.321262
0.306397
...
0.117610
0.043966
0.421954
-0.092423
-0.007159
-0.031898
0.080034
0.324475
-0.229613
0.512074
V4
0.781212
0.657790
0.057697
0.315046
1.000000
-0.117529
-0.052370
0.449542
0.141129
0.927685
...
0.659093
0.022807
0.447016
-0.026186
0.062367
0.028659
0.100010
0.113609
-0.031054
0.603984
V6
0.189267
0.276805
0.615938
0.233896
-0.117529
1.000000
0.917502
0.468233
0.415660
-0.087312
...
-0.467980
0.188907
0.546535
0.144550
0.054210
-0.002914
0.044992
0.433804
-0.404817
0.370037
V7
0.141294
0.205023
0.477114
0.197836
-0.052370
0.917502
1.000000
0.389987
0.310982
-0.036791
...
-0.311363
0.170113
0.475254
0.122707
0.034508
-0.019103
0.111166
0.340479
-0.292285
0.287815
V8
0.794013
0.874650
0.703431
0.411946
0.449542
0.468233
0.389987
1.000000
0.419703
0.420557
...
-0.011091
0.150258
0.878072
0.038430
0.026843
-0.036297
0.179167
0.326586
-0.553121
0.831904
V10
0.298443
0.310120
0.346006
0.321262
0.141129
0.415660
0.310982
0.419703
1.000000
0.140462
...
-0.105042
-0.036705
0.560213
-0.093213
0.016739
-0.026994
0.026846
0.922190
-0.045851
0.394767
V12
0.751830
0.656186
0.059941
0.306397
0.927685
-0.087312
-0.036791
0.420557
0.140462
1.000000
...
0.666775
0.028866
0.441963
-0.007658
0.046674
0.010122
0.081963
0.112150
-0.054827
0.594189
V13
0.185144
0.157518
0.204762
-0.003636
0.075993
0.138367
0.110973
0.153299
-0.059553
0.098771
...
0.008235
0.027328
0.113743
0.130598
0.157513
0.116944
0.219906
-0.024751
-0.379714
0.203373
V14
-0.004144
-0.006268
-0.106282
-0.232677
0.023853
0.072911
0.163931
0.008138
-0.077543
0.020069
...
0.056814
-0.004057
0.010989
0.106581
0.073535
0.043218
0.233523
-0.086217
0.010553
0.008424
V15
0.314520
0.164702
-0.224573
0.143457
0.615704
-0.431542
-0.291272
0.018366
-0.046737
0.642081
...
0.951314
-0.111311
0.011768
-0.104618
0.050254
0.048602
0.100817
-0.051861
0.245635
0.154020
V16
0.347357
0.435606
0.782474
0.394517
0.023818
0.847119
0.752683
0.680031
0.546975
0.025736
...
-0.342210
0.154794
0.778538
0.041474
0.028878
-0.054775
0.082293
0.551880
-0.420053
0.536748
V18
0.148622
0.123862
0.132105
0.022868
0.136022
0.110570
0.098691
0.093682
-0.024693
0.119833
...
0.053958
0.470341
0.079718
0.411967
0.512139
0.365410
0.152088
0.019603
-0.181937
0.170721
V19
-0.100294
-0.092673
-0.161802
-0.246008
-0.205729
0.215290
0.158371
-0.144693
0.074903
-0.148319
...
-0.205409
0.100133
-0.131542
0.144018
-0.021517
-0.079753
-0.220737
0.087605
0.012115
-0.114976
V20
0.462493
0.459795
0.298385
0.289594
0.291309
0.136091
0.089399
0.412868
0.207612
0.271559
...
0.016233
0.086165
0.326863
0.050699
0.009358
-0.000979
0.048981
0.161315
-0.322006
0.444965
V21
-0.029285
-0.012911
-0.030932
0.114373
0.174025
-0.051806
-0.065300
-0.047839
0.082288
0.144371
...
0.157097
-0.077945
0.053025
-0.159128
-0.087561
-0.053707
-0.199398
0.047340
0.315470
-0.010063
V23
0.231136
0.222574
0.065509
0.081374
0.196530
0.069901
0.125180
0.174124
-0.066537
0.180049
...
0.116122
0.363963
0.129783
0.367086
0.183666
0.196681
0.635252
-0.035949
-0.187582
0.226331
V24
-0.324959
-0.233556
0.010225
-0.237326
-0.529866
0.072418
-0.030292
-0.136898
-0.029420
-0.550881
...
-0.642370
0.033532
-0.202097
0.060608
-0.134320
-0.095588
-0.243738
-0.041325
-0.137614
-0.264815
V25
-0.200706
-0.070627
0.481785
-0.100569
-0.444375
0.438610
0.316744
0.173320
0.079805
-0.448877
...
-0.575154
0.088238
0.201243
0.065501
-0.013312
-0.030747
-0.093948
0.069302
-0.246742
-0.019373
V26
-0.125140
-0.043012
0.035370
-0.027685
-0.080487
0.106055
0.160566
0.015724
0.072366
-0.124111
...
-0.133694
-0.057247
0.062879
-0.004545
-0.034596
0.051294
0.085576
0.064963
0.010880
-0.046724
V27
0.733198
0.824198
0.726250
0.392006
0.412083
0.474441
0.424185
0.901100
0.246085
0.374380
...
-0.032772
0.208074
0.790239
0.095127
0.030135
-0.036123
0.159884
0.226713
-0.617771
0.812585
V29
0.302145
0.147096
-0.275764
0.117610
0.659093
-0.467980
-0.311363
-0.011091
-0.105042
0.666775
...
1.000000
-0.122817
-0.004364
-0.110699
0.035272
0.035392
0.078588
-0.099309
0.285581
0.123329
V30
0.156968
0.175997
0.175943
0.043966
0.022807
0.188907
0.170113
0.150258
-0.036705
0.028866
...
-0.122817
1.000000
0.114318
0.695725
0.083693
-0.028573
-0.027987
0.006961
-0.256814
0.187311
V31
0.675003
0.769745
0.653764
0.421954
0.447016
0.546535
0.475254
0.878072
0.560213
0.441963
...
-0.004364
0.114318
1.000000
0.016782
0.016733
-0.047273
0.152314
0.510851
-0.357785
0.750297
V32
0.050951
0.085604
0.033942
-0.092423
-0.026186
0.144550
0.122707
0.038430
-0.093213
-0.007658
...
-0.110699
0.695725
0.016782
1.000000
0.105255
0.069300
0.016901
-0.054411
-0.162417
0.066606
V33
0.056439
0.035129
0.050309
-0.007159
0.062367
0.054210
0.034508
0.026843
0.016739
0.046674
...
0.035272
0.083693
0.016733
0.105255
1.000000
0.719126
0.167597
0.031586
-0.062715
0.077273
V34
-0.019342
-0.029115
-0.025620
-0.031898
0.028659
-0.002914
-0.019103
-0.036297
-0.026994
0.010122
...
0.035392
-0.028573
-0.047273
0.069300
0.719126
1.000000
0.233616
-0.019032
-0.006854
-0.006034
V35
0.138933
0.146329
0.043648
0.080034
0.100010
0.044992
0.111166
0.179167
0.026846
0.081963
...
0.078588
-0.027987
0.152314
0.016901
0.167597
0.233616
1.000000
0.025401
-0.077991
0.140294
V36
0.231417
0.235299
0.316462
0.324475
0.113609
0.433804
0.340479
0.326586
0.922190
0.112150
...
-0.099309
0.006961
0.510851
-0.054411
0.031586
-0.019032
0.025401
1.000000
-0.039478
0.319309
V37
-0.494076
-0.494043
-0.734956
-0.229613
-0.031054
-0.404817
-0.292285
-0.553121
-0.045851
-0.054827
...
0.285581
-0.256814
-0.357785
-0.162417
-0.062715
-0.006854
-0.077991
-0.039478
1.000000
-0.565795
target
0.873212
0.871846
0.638878
0.512074
0.603984
0.370037
0.287815
0.831904
0.394767
0.594189
...
0.123329
0.187311
0.750297
0.066606
0.077273
-0.006034
0.140294
0.319309
-0.565795
1.000000
33 rows × 33 columns
# Annotated heatmap of the Pearson correlation matrix.
# plt.subplots returns a (figure, axes) tuple — unpack it instead of
# binding the tuple to a name called `ax`.
fig, ax = plt.subplots(figsize=(20, 16))
ax = sns.heatmap(train_corr, vmax=.8, square=True, annot=True, ax=ax)
# Lower-triangle Spearman correlation heatmap of the retained features.
data_train1 = train_data.drop(['V5', 'V9', 'V11', 'V17', 'V22', 'V28'], axis=1)

plt.figure(figsize=(20, 16))
colnm = data_train1.columns.tolist()
mcorr = data_train1[colnm].corr(method="spearman")
# np.bool was removed in NumPy >= 1.24 — use the builtin bool dtype.
mask = np.zeros_like(mcorr, dtype=bool)
mask[np.triu_indices_from(mask)] = True  # hide the redundant upper triangle
cmap = sns.diverging_palette(220, 10, as_cmap=True)
g = sns.heatmap(mcorr, mask=mask, cmap=cmap, square=True, annot=True, fmt='0.2f')
plt.show()
上图为所有特征变量和target变量两两之间的相关系数,由此可以看出各个特征变量V0-V37之间的相关性以及特征变量V0-V37与target的相关性。
查找出特征变量和target变量相关系数大于0.5的特征变量
# Heatmap of the k features most correlated with target.
k = 10
cols = train_corr.nlargest(k, 'target')['target'].index

# (Original also computed np.corrcoef(train_data[cols].values.T) into an
# unused variable `cm`; removed.)
plt.figure(figsize=(10, 10))
hm = sns.heatmap(train_data[cols].corr(), annot=True, square=True)
plt.show()
# Heatmap restricted to features with |corr(target)| above the threshold.
threshold = 0.5

corrmat = train_data.corr()
top_corr_features = corrmat.index[abs(corrmat["target"]) > threshold]
plt.figure(figsize=(10, 10))
g = sns.heatmap(train_data[top_corr_features].corr(), annot=True, cmap="RdYlGn")
# Re-bind the list of distribution-mismatched features to drop.
# (The original called drop_columns.clear() first — pointless, since the
# rebinding below discards the old list anyway.)
drop_columns = ['V5', 'V9', 'V11', 'V17', 'V22', 'V28']
# Identify features whose absolute correlation with target is below the
# threshold; these are candidates for removal.
threshold = 0.5

corr_matrix = data_train1.corr().abs()
drop_col = corr_matrix[corr_matrix["target"] < threshold].index
由于’V14’, ‘V21’, ‘V25’, ‘V26’, ‘V32’, ‘V33’, 'V34’特征的相关系数值小于0.5,故认为这些特征与最终的预测target值不相关,删除这些特征变量;
# Stack train features (without target) on top of test features so both
# are normalized on the same scale, then drop the mismatched columns.
# NOTE: indices of train and test overlap after concat — fine here because
# the result is only used column-wise.
train_x = train_data.drop(['target'], axis=1)

data_all = pd.concat([train_x, test_data])

data_all.drop(drop_columns, axis=1, inplace=True)
data_all.head()
V0
V1
V2
V3
V4
V6
V7
V8
V10
V12
...
V27
V29
V30
V31
V32
V33
V34
V35
V36
V37
0
0.566
0.016
-0.143
0.407
0.452
-1.812
-2.360
-0.436
-0.940
-0.073
...
0.168
0.136
0.109
-0.615
0.327
-4.627
-4.789
-5.101
-2.608
-3.508
1
0.968
0.437
0.066
0.566
0.194
-1.566
-2.360
0.332
0.188
-0.134
...
0.338
-0.128
0.124
0.032
0.600
-0.843
0.160
0.364
-0.335
-0.730
2
1.013
0.568
0.235
0.370
0.112
-1.367
-2.360
0.396
0.874
-0.072
...
0.326
-0.009
0.361
0.277
-0.116
-0.843
0.160
0.364
0.765
-0.589
3
0.733
0.368
0.283
0.165
0.599
-1.200
-2.086
0.403
0.011
-0.014
...
0.277
0.015
0.417
0.279
0.603
-0.843
-0.065
0.364
0.333
-0.112
4
0.684
0.638
0.260
0.209
0.337
-1.073
-2.086
0.314
-0.251
0.199
...
0.332
0.183
1.078
0.328
0.418
-0.843
-0.215
0.364
-0.280
-0.028
5 rows × 32 columns
cols_numeric = list(data_all.columns)


def scale_minmax(col):
    """Min-max normalize a Series/array to the [0, 1] range."""
    return (col - col.min()) / (col.max() - col.min())


# Column-wise normalization of the combined train+test frame.
data_all[cols_numeric] = data_all[cols_numeric].apply(scale_minmax, axis=0)
data_all[cols_numeric].describe()
V0
V1
V2
V3
V4
V6
V7
V8
V10
V12
...
V27
V29
V30
V31
V32
V33
V34
V35
V36
V37
count
4813.000000
4813.000000
4813.000000
4813.000000
4813.000000
4813.000000
4813.000000
4813.000000
4813.000000
4813.000000
...
4813.000000
4813.000000
4813.000000
4813.000000
4813.000000
4813.000000
4813.000000
4813.000000
4813.000000
4813.000000
mean
0.694172
0.721357
0.602300
0.603139
0.523743
0.748823
0.745740
0.715607
0.348518
0.578507
...
0.881401
0.388683
0.589459
0.792709
0.628824
0.458493
0.483790
0.762873
0.332385
0.545795
std
0.144198
0.131443
0.140628
0.152462
0.106430
0.132560
0.132577
0.118105
0.134882
0.105088
...
0.128221
0.133475
0.130786
0.102976
0.155003
0.099095
0.101020
0.102037
0.127456
0.150356
min
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
...
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
25%
0.626676
0.679416
0.514414
0.503888
0.478182
0.683324
0.696938
0.664934
0.284327
0.532892
...
0.888575
0.292445
0.550092
0.761816
0.562461
0.409037
0.454490
0.727273
0.270584
0.445647
50%
0.729488
0.752497
0.617072
0.614270
0.535866
0.774125
0.771974
0.742884
0.366469
0.591635
...
0.916015
0.375734
0.594428
0.815055
0.643056
0.454518
0.499949
0.800020
0.347056
0.539317
75%
0.790195
0.799553
0.700464
0.710474
0.585036
0.842259
0.836405
0.790835
0.432965
0.641971
...
0.932555
0.471837
0.650798
0.852229
0.719777
0.500000
0.511365
0.800020
0.414861
0.643061
max
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
...
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
8 rows × 32 columns
# Min-max normalize train and test sets separately over the retained columns.
# (Original selected [cols_numeric] twice in a row; once is enough.)
train_data_process = train_data[cols_numeric].apply(scale_minmax, axis=0)

test_data_process = test_data[cols_numeric].apply(scale_minmax, axis=0)
# Split the feature list in two halves for plotting in two batches.
cols_numeric_left = cols_numeric[0:13]
cols_numeric_right = cols_numeric[13:]
# For each of the first 13 features: show the original distribution, Q-Q
# plot and target correlation, then the same three after a Box-Cox
# transform (input shifted by +1 so it is strictly positive).
train_data_process = pd.concat([train_data_process, train_data['target']], axis=1)

fcols = 6
frows = len(cols_numeric_left)
plt.figure(figsize=(4 * fcols, 4 * frows))

i = 0
for var in cols_numeric_left:
    dat = train_data_process[[var, 'target']].dropna()

    # Original distribution.
    i += 1
    plt.subplot(frows, fcols, i)
    sns.distplot(dat[var], fit=stats.norm)
    plt.title(var + ' Original')
    plt.xlabel('')

    # Original Q-Q plot with skewness in the title.
    i += 1
    plt.subplot(frows, fcols, i)
    _ = stats.probplot(dat[var], plot=plt)
    plt.title('skew=' + '{:.4f}'.format(stats.skew(dat[var])))
    plt.xlabel('')
    plt.ylabel('')

    # Original scatter vs target with Pearson correlation.
    i += 1
    plt.subplot(frows, fcols, i)
    plt.plot(dat[var], dat['target'], '.', alpha=0.5)
    plt.title('corr=' + '{:.2f}'.format(np.corrcoef(dat[var], dat['target'])[0][1]))

    # Box-Cox transformed distribution (re-normalized to [0, 1]).
    i += 1
    plt.subplot(frows, fcols, i)
    trans_var, lambda_var = stats.boxcox(dat[var].dropna() + 1)
    trans_var = scale_minmax(trans_var)
    sns.distplot(trans_var, fit=stats.norm)
    plt.title(var + ' Tramsformed')
    plt.xlabel('')

    # Transformed Q-Q plot.
    i += 1
    plt.subplot(frows, fcols, i)
    _ = stats.probplot(trans_var, plot=plt)
    plt.title('skew=' + '{:.4f}'.format(stats.skew(trans_var)))
    plt.xlabel('')
    plt.ylabel('')

    # Transformed scatter vs target.
    i += 1
    plt.subplot(frows, fcols, i)
    plt.plot(trans_var, dat['target'], '.', alpha=0.5)
    plt.title('corr=' + '{:.2f}'.format(np.corrcoef(trans_var, dat['target'])[0][1]))
# Same six-panel Box-Cox comparison for the remaining feature columns.
fcols = 6
frows = len(cols_numeric_right)
plt.figure(figsize=(4 * fcols, 4 * frows))

i = 0
for var in cols_numeric_right:
    dat = train_data_process[[var, 'target']].dropna()

    # Original distribution.
    i += 1
    plt.subplot(frows, fcols, i)
    sns.distplot(dat[var], fit=stats.norm)
    plt.title(var + ' Original')
    plt.xlabel('')

    # Original Q-Q plot with skewness in the title.
    i += 1
    plt.subplot(frows, fcols, i)
    _ = stats.probplot(dat[var], plot=plt)
    plt.title('skew=' + '{:.4f}'.format(stats.skew(dat[var])))
    plt.xlabel('')
    plt.ylabel('')

    # Original scatter vs target with Pearson correlation.
    i += 1
    plt.subplot(frows, fcols, i)
    plt.plot(dat[var], dat['target'], '.', alpha=0.5)
    plt.title('corr=' + '{:.2f}'.format(np.corrcoef(dat[var], dat['target'])[0][1]))

    # Box-Cox transformed distribution (input shifted +1, re-normalized).
    i += 1
    plt.subplot(frows, fcols, i)
    trans_var, lambda_var = stats.boxcox(dat[var].dropna() + 1)
    trans_var = scale_minmax(trans_var)
    sns.distplot(trans_var, fit=stats.norm)
    plt.title(var + ' Tramsformed')
    plt.xlabel('')

    # Transformed Q-Q plot.
    i += 1
    plt.subplot(frows, fcols, i)
    _ = stats.probplot(trans_var, plot=plt)
    plt.title('skew=' + '{:.4f}'.format(stats.skew(trans_var)))
    plt.xlabel('')
    plt.ylabel('')

    # Transformed scatter vs target.
    i += 1
    plt.subplot(frows, fcols, i)
    plt.plot(trans_var, dat['target'], '.', alpha=0.5)
    plt.title('corr=' + '{:.2f}'.format(np.corrcoef(trans_var, dat['target'])[0][1]))