导入数据探索的工具包 
1 2 3 4 5 6 7 8 9 10 11 import  numpy as  npimport  pandas as  pdimport  matplotlib.pyplot as  pltimport  seaborn as  snsfrom  scipy import  statsimport  warningswarnings.filterwarnings("ignore" )   %matplotlib inline 
 
  读取数据文件 
使用 Pandas 库的 read_csv() 函数进行数据读取,分隔符为 '\t'
1 2 3 !wget http://tianchi-media.oss-cn-beijing.aliyuncs.com/DSW/Industrial_Steam_Forecast/zhengqi_test.txt !wget http://tianchi-media.oss-cn-beijing.aliyuncs.com/DSW/Industrial_Steam_Forecast/zhengqi_train.txt 
 
# Load the tab-separated train/test files into DataFrames.
train_data_file = "./zhengqi_train.txt"
test_data_file = "./zhengqi_test.txt"

train_data = pd.read_csv(train_data_file, sep='\t', encoding='utf-8')
test_data = pd.read_csv(test_data_file, sep='\t', encoding='utf-8')
 
  查看训练集特征变量信息 
 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2888 entries, 0 to 2887
Data columns (total 39 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   V0      2888 non-null   float64
 1   V1      2888 non-null   float64
 2   V2      2888 non-null   float64
 3   V3      2888 non-null   float64
 4   V4      2888 non-null   float64
 5   V5      2888 non-null   float64
 6   V6      2888 non-null   float64
 7   V7      2888 non-null   float64
 8   V8      2888 non-null   float64
 9   V9      2888 non-null   float64
 10  V10     2888 non-null   float64
 11  V11     2888 non-null   float64
 12  V12     2888 non-null   float64
 13  V13     2888 non-null   float64
 14  V14     2888 non-null   float64
 15  V15     2888 non-null   float64
 16  V16     2888 non-null   float64
 17  V17     2888 non-null   float64
 18  V18     2888 non-null   float64
 19  V19     2888 non-null   float64
 20  V20     2888 non-null   float64
 21  V21     2888 non-null   float64
 22  V22     2888 non-null   float64
 23  V23     2888 non-null   float64
 24  V24     2888 non-null   float64
 25  V25     2888 non-null   float64
 26  V26     2888 non-null   float64
 27  V27     2888 non-null   float64
 28  V28     2888 non-null   float64
 29  V29     2888 non-null   float64
 30  V30     2888 non-null   float64
 31  V31     2888 non-null   float64
 32  V32     2888 non-null   float64
 33  V33     2888 non-null   float64
 34  V34     2888 non-null   float64
 35  V35     2888 non-null   float64
 36  V36     2888 non-null   float64
 37  V37     2888 non-null   float64
 38  target  2888 non-null   float64
dtypes: float64(39)
memory usage: 880.1 KB
 
此训练集数据共有2888个样本,数据中有V0-V37共计38个特征变量,变量类型都为数值类型,所有数据特征没有缺失值数据; 数据字段由于采用了脱敏处理,删除了特征数据的具体含义;target字段为标签变量
 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1925 entries, 0 to 1924
Data columns (total 38 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   V0      1925 non-null   float64
 1   V1      1925 non-null   float64
 2   V2      1925 non-null   float64
 3   V3      1925 non-null   float64
 4   V4      1925 non-null   float64
 5   V5      1925 non-null   float64
 6   V6      1925 non-null   float64
 7   V7      1925 non-null   float64
 8   V8      1925 non-null   float64
 9   V9      1925 non-null   float64
 10  V10     1925 non-null   float64
 11  V11     1925 non-null   float64
 12  V12     1925 non-null   float64
 13  V13     1925 non-null   float64
 14  V14     1925 non-null   float64
 15  V15     1925 non-null   float64
 16  V16     1925 non-null   float64
 17  V17     1925 non-null   float64
 18  V18     1925 non-null   float64
 19  V19     1925 non-null   float64
 20  V20     1925 non-null   float64
 21  V21     1925 non-null   float64
 22  V22     1925 non-null   float64
 23  V23     1925 non-null   float64
 24  V24     1925 non-null   float64
 25  V25     1925 non-null   float64
 26  V26     1925 non-null   float64
 27  V27     1925 non-null   float64
 28  V28     1925 non-null   float64
 29  V29     1925 non-null   float64
 30  V30     1925 non-null   float64
 31  V31     1925 non-null   float64
 32  V32     1925 non-null   float64
 33  V33     1925 non-null   float64
 34  V34     1925 non-null   float64
 35  V35     1925 non-null   float64
 36  V36     1925 non-null   float64
 37  V37     1925 non-null   float64
dtypes: float64(38)
memory usage: 571.6 KB
 
测试集数据共有1925个样本,数据中有V0-V37共计38个特征变量,变量类型都为数值类型
  查看数据统计信息 
 
  
    
       
      V0 
      V1 
      V2 
      V3 
      V4 
      V5 
      V6 
      V7 
      V8 
      V9 
      ... 
      V29 
      V30 
      V31 
      V32 
      V33 
      V34 
      V35 
      V36 
      V37 
      target 
     
   
  
    
      count 
      2888.000000 
      2888.000000 
      2888.000000 
      2888.000000 
      2888.000000 
      2888.000000 
      2888.000000 
      2888.000000 
      2888.000000 
      2888.000000 
      ... 
      2888.000000 
      2888.000000 
      2888.000000 
      2888.000000 
      2888.000000 
      2888.000000 
      2888.000000 
      2888.000000 
      2888.000000 
      2888.000000 
     
    
      mean 
      0.123048 
      0.056068 
      0.289720 
      -0.067790 
      0.012921 
      -0.558565 
      0.182892 
      0.116155 
      0.177856 
      -0.169452 
      ... 
      0.097648 
      0.055477 
      0.127791 
      0.020806 
      0.007801 
      0.006715 
      0.197764 
      0.030658 
      -0.130330 
      0.126353 
     
    
      std 
      0.928031 
      0.941515 
      0.911236 
      0.970298 
      0.888377 
      0.517957 
      0.918054 
      0.955116 
      0.895444 
      0.953813 
      ... 
      1.061200 
      0.901934 
      0.873028 
      0.902584 
      1.006995 
      1.003291 
      0.985675 
      0.970812 
      1.017196 
      0.983966 
     
    
      min 
      -4.335000 
      -5.122000 
      -3.420000 
      -3.956000 
      -4.742000 
      -2.182000 
      -4.576000 
      -5.048000 
      -4.692000 
      -12.891000 
      ... 
      -2.912000 
      -4.507000 
      -5.859000 
      -4.053000 
      -4.627000 
      -4.789000 
      -5.695000 
      -2.608000 
      -3.630000 
      -3.044000 
     
    
      25% 
      -0.297000 
      -0.226250 
      -0.313000 
      -0.652250 
      -0.385000 
      -0.853000 
      -0.310000 
      -0.295000 
      -0.159000 
      -0.390000 
      ... 
      -0.664000 
      -0.283000 
      -0.170250 
      -0.407250 
      -0.499000 
      -0.290000 
      -0.202500 
      -0.413000 
      -0.798250 
      -0.350250 
     
    
      50% 
      0.359000 
      0.272500 
      0.386000 
      -0.044500 
      0.110000 
      -0.466000 
      0.388000 
      0.344000 
      0.362000 
      0.042000 
      ... 
      -0.023000 
      0.053500 
      0.299500 
      0.039000 
      -0.040000 
      0.160000 
      0.364000 
      0.137000 
      -0.185500 
      0.313000 
     
    
      75% 
      0.726000 
      0.599000 
      0.918250 
      0.624000 
      0.550250 
      -0.154000 
      0.831250 
      0.782250 
      0.726000 
      0.042000 
      ... 
      0.745250 
      0.488000 
      0.635000 
      0.557000 
      0.462000 
      0.273000 
      0.602000 
      0.644250 
      0.495250 
      0.793250 
     
    
      max 
      2.121000 
      1.918000 
      2.828000 
      2.457000 
      2.689000 
      0.489000 
      1.895000 
      1.918000 
      2.245000 
      1.335000 
      ... 
      4.580000 
      2.689000 
      2.013000 
      2.395000 
      5.465000 
      5.110000 
      2.324000 
      5.238000 
      3.000000 
      2.538000 
     
   
8 rows × 39 columns
 
 
  
    
       
      V0 
      V1 
      V2 
      V3 
      V4 
      V5 
      V6 
      V7 
      V8 
      V9 
      ... 
      V28 
      V29 
      V30 
      V31 
      V32 
      V33 
      V34 
      V35 
      V36 
      V37 
     
   
  
    
      count 
      1925.000000 
      1925.000000 
      1925.000000 
      1925.000000 
      1925.000000 
      1925.000000 
      1925.000000 
      1925.000000 
      1925.000000 
      1925.000000 
      ... 
      1925.000000 
      1925.000000 
      1925.000000 
      1925.000000 
      1925.000000 
      1925.000000 
      1925.000000 
      1925.000000 
      1925.000000 
      1925.000000 
     
    
      mean 
      -0.184404 
      -0.083912 
      -0.434762 
      0.101671 
      -0.019172 
      0.838049 
      -0.274092 
      -0.173971 
      -0.266709 
      0.255114 
      ... 
      -0.206871 
      -0.146463 
      -0.083215 
      -0.191729 
      -0.030782 
      -0.011433 
      -0.009985 
      -0.296895 
      -0.046270 
      0.195735 
     
    
      std 
      1.073333 
      1.076670 
      0.969541 
      1.034925 
      1.147286 
      0.963043 
      1.054119 
      1.040101 
      1.085916 
      1.014394 
      ... 
      1.064140 
      0.880593 
      1.126414 
      1.138454 
      1.130228 
      0.989732 
      0.995213 
      0.946896 
      1.040854 
      0.940599 
     
    
      min 
      -4.814000 
      -5.488000 
      -4.283000 
      -3.276000 
      -4.921000 
      -1.168000 
      -5.649000 
      -5.625000 
      -6.059000 
      -6.784000 
      ... 
      -2.435000 
      -2.413000 
      -4.507000 
      -7.698000 
      -4.057000 
      -4.627000 
      -4.789000 
      -7.477000 
      -2.608000 
      -3.346000 
     
    
      25% 
      -0.664000 
      -0.451000 
      -0.978000 
      -0.644000 
      -0.497000 
      0.122000 
      -0.732000 
      -0.509000 
      -0.775000 
      -0.390000 
      ... 
      -0.453000 
      -0.818000 
      -0.339000 
      -0.476000 
      -0.472000 
      -0.460000 
      -0.290000 
      -0.349000 
      -0.593000 
      -0.432000 
     
    
      50% 
      0.065000 
      0.195000 
      -0.267000 
      0.220000 
      0.118000 
      0.437000 
      -0.082000 
      0.018000 
      -0.004000 
      0.401000 
      ... 
      -0.445000 
      -0.199000 
      0.010000 
      0.100000 
      0.155000 
      -0.040000 
      0.160000 
      -0.270000 
      0.083000 
      0.152000 
     
    
      75% 
      0.549000 
      0.589000 
      0.278000 
      0.793000 
      0.610000 
      1.928000 
      0.457000 
      0.515000 
      0.482000 
      0.904000 
      ... 
      -0.434000 
      0.468000 
      0.447000 
      0.471000 
      0.627000 
      0.419000 
      0.273000 
      0.364000 
      0.651000 
      0.797000 
     
    
      max 
      2.100000 
      2.120000 
      1.946000 
      2.603000 
      4.475000 
      3.176000 
      1.528000 
      1.394000 
      2.408000 
      1.766000 
      ... 
      4.656000 
      3.022000 
      3.139000 
      1.428000 
      2.299000 
      5.465000 
      5.110000 
      1.671000 
      2.861000 
      3.021000 
     
   
8 rows × 38 columns
 
上面数据显示了数据的统计信息,例如样本数,数据的均值mean,标准差std,最小值,最大值等
  查看数据字段信息 
 
  
    
       
      V0 
      V1 
      V2 
      V3 
      V4 
      V5 
      V6 
      V7 
      V8 
      V9 
      ... 
      V29 
      V30 
      V31 
      V32 
      V33 
      V34 
      V35 
      V36 
      V37 
      target 
     
   
  
    
      0 
      0.566 
      0.016 
      -0.143 
      0.407 
      0.452 
      -0.901 
      -1.812 
      -2.360 
      -0.436 
      -2.114 
      ... 
      0.136 
      0.109 
      -0.615 
      0.327 
      -4.627 
      -4.789 
      -5.101 
      -2.608 
      -3.508 
      0.175 
     
    
      1 
      0.968 
      0.437 
      0.066 
      0.566 
      0.194 
      -0.893 
      -1.566 
      -2.360 
      0.332 
      -2.114 
      ... 
      -0.128 
      0.124 
      0.032 
      0.600 
      -0.843 
      0.160 
      0.364 
      -0.335 
      -0.730 
      0.676 
     
    
      2 
      1.013 
      0.568 
      0.235 
      0.370 
      0.112 
      -0.797 
      -1.367 
      -2.360 
      0.396 
      -2.114 
      ... 
      -0.009 
      0.361 
      0.277 
      -0.116 
      -0.843 
      0.160 
      0.364 
      0.765 
      -0.589 
      0.633 
     
    
      3 
      0.733 
      0.368 
      0.283 
      0.165 
      0.599 
      -0.679 
      -1.200 
      -2.086 
      0.403 
      -2.114 
      ... 
      0.015 
      0.417 
      0.279 
      0.603 
      -0.843 
      -0.065 
      0.364 
      0.333 
      -0.112 
      0.206 
     
    
      4 
      0.684 
      0.638 
      0.260 
      0.209 
      0.337 
      -0.454 
      -1.073 
      -2.086 
      0.314 
      -2.114 
      ... 
      0.183 
      1.078 
      0.328 
      0.418 
      -0.843 
      -0.215 
      0.364 
      -0.280 
      -0.028 
      0.384 
     
   
5 rows × 39 columns
 
上面显示训练集前5条数据的基本信息,可以看到数据都是浮点型数据,数据都是数值型连续型特征
 
  
    
       
      V0 
      V1 
      V2 
      V3 
      V4 
      V5 
      V6 
      V7 
      V8 
      V9 
      ... 
      V28 
      V29 
      V30 
      V31 
      V32 
      V33 
      V34 
      V35 
      V36 
      V37 
     
   
  
    
      0 
      0.368 
      0.380 
      -0.225 
      -0.049 
      0.379 
      0.092 
      0.550 
      0.551 
      0.244 
      0.904 
      ... 
      -0.449 
      0.047 
      0.057 
      -0.042 
      0.847 
      0.534 
      -0.009 
      -0.190 
      -0.567 
      0.388 
     
    
      1 
      0.148 
      0.489 
      -0.247 
      -0.049 
      0.122 
      -0.201 
      0.487 
      0.493 
      -0.127 
      0.904 
      ... 
      -0.443 
      0.047 
      0.560 
      0.176 
      0.551 
      0.046 
      -0.220 
      0.008 
      -0.294 
      0.104 
     
    
      2 
      -0.166 
      -0.062 
      -0.311 
      0.046 
      -0.055 
      0.063 
      0.485 
      0.493 
      -0.227 
      0.904 
      ... 
      -0.458 
      -0.398 
      0.101 
      0.199 
      0.634 
      0.017 
      -0.234 
      0.008 
      0.373 
      0.569 
     
    
      3 
      0.102 
      0.294 
      -0.259 
      0.051 
      -0.183 
      0.148 
      0.474 
      0.504 
      0.010 
      0.904 
      ... 
      -0.456 
      -0.398 
      1.007 
      0.137 
      1.042 
      -0.040 
      -0.290 
      0.008 
      -0.666 
      0.391 
     
    
      4 
      0.300 
      0.428 
      0.208 
      0.051 
      -0.033 
      0.116 
      0.408 
      0.497 
      0.155 
      0.904 
      ... 
      -0.458 
      -0.776 
      0.291 
      0.370 
      0.181 
      -0.040 
      -0.290 
      0.008 
      -0.140 
      -0.497 
     
   
5 rows × 38 columns
 
  画箱形图探索数据 
# Box plot of feature V0 to eyeball its spread and outliers.
fig = plt.figure(figsize=(4, 6))
# Pass the data as a keyword: positional Series args were deprecated in
# seaborn 0.12 and later removed.
sns.boxplot(y=train_data['V0'], orient="v", width=0.5)
 
<AxesSubplot:xlabel='V0'>
 
 
 
# Box plots for all 38 features (V0–V37) laid out in a 13x3 grid.
column = train_data.columns.tolist()[:39]  # 38 features + target; only 38 plotted
fig = plt.figure(figsize=(20, 40))
for i in range(38):
    plt.subplot(13, 3, i + 1)
    # Keyword arg: positional Series args were removed in seaborn >= 0.12.
    sns.boxplot(y=train_data[column[i]], orient="v", width=0.5)
    plt.ylabel(column[i], fontsize=8)
plt.show()
 
 
 
  查看数据分布图 
查看特征变量‘V0’的数据分布直方图,并绘制Q-Q图查看数据是否近似于正态分布
# Histogram (with fitted normal curve) and Q-Q plot for V0,
# to check whether the feature is approximately normally distributed.
plt.figure(figsize=(10, 5))

ax = plt.subplot(1, 2, 1)
sns.distplot(train_data['V0'], fit=stats.norm)  # NOTE: distplot is deprecated in seaborn >= 0.11

ax = plt.subplot(1, 2, 2)
res = stats.probplot(train_data['V0'], plot=plt)
 
 
 
查看查看所有数据的直方图和Q-Q图,查看训练集的数据是否近似于正态分布
# Histogram + Q-Q plot for every training column, drawn as
# (histogram, Q-Q) pairs, three pairs per row.
train_cols = 6
train_rows = len(train_data.columns)
plt.figure(figsize=(4 * train_cols, 4 * train_rows))

i = 0
for col in train_data.columns:
    i += 1
    ax = plt.subplot(train_rows, train_cols, i)
    sns.distplot(train_data[col], fit=stats.norm)

    i += 1
    ax = plt.subplot(train_rows, train_cols, i)
    res = stats.probplot(train_data[col], plot=plt)
plt.show()
 
 
 
由上面的数据分布图信息可以看出,很多特征变量(如 'V1'、'V9'、'V24'、'V28' 等)的数据分布不是正态的,其 Q-Q 图上的点并不沿对角线分布,后续可以对这些数据进行变换转换。
对比同一特征变量‘V0’下,训练集数据和测试集数据的分布情况,查看数据分布是否一致
# Overlay the train and test KDEs of V0 to compare their distributions.
ax = sns.kdeplot(train_data['V0'], color="Red", shade=True)
ax = sns.kdeplot(test_data['V0'], color="Blue", shade=True)
ax.set_xlabel('V0')
ax.set_ylabel("Frequency")
ax = ax.legend(["train", "test"])
 
 
 
查看所有特征变量下,训练集数据和测试集数据的分布情况,分析并寻找出数据分布不一致的特征变量。
# Overlay train/test KDEs for every feature to spot covariate shift
# (features whose train and test distributions disagree).
dist_cols = 6
dist_rows = len(test_data.columns)
plt.figure(figsize=(4 * dist_cols, 4 * dist_rows))

i = 1
for col in test_data.columns:
    ax = plt.subplot(dist_rows, dist_cols, i)
    ax = sns.kdeplot(train_data[col], color="Red", shade=True)
    ax = sns.kdeplot(test_data[col], color="Blue", shade=True)
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")
    ax = ax.legend(["train", "test"])
    i += 1
plt.show()
 
 
 
查看特征 'V5'、'V9'、'V11'、'V17'、'V22'、'V28' 的数据分布
# Zoom in on the six features whose train/test distributions differ.
drop_col = 6
drop_row = 1
plt.figure(figsize=(5 * drop_col, 5 * drop_row))

i = 1
for col in ["V5", "V9", "V11", "V17", "V22", "V28"]:
    ax = plt.subplot(drop_row, drop_col, i)
    ax = sns.kdeplot(train_data[col], color="Red", shade=True)
    ax = sns.kdeplot(test_data[col], color="Blue", shade=True)
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")
    ax = ax.legend(["train", "test"])
    i += 1
plt.show()
 
 
 
由上图的数据分布可以看到特征’V5’,‘V9’,‘V11’,‘V17’,‘V22’,‘V28’ 训练集数据与测试集数据分布不一致,会导致模型泛化能力差,采用删除此类特征方法。
# Features with inconsistent train/test distributions — to be dropped later.
drop_columns = ['V5', 'V9', 'V11', 'V17', 'V22', 'V28']
 
  可视化线性回归关系 
查看特征变量‘V0’与’target’变量的线性回归关系
# Linear-regression fit of V0 against target, side by side with V0's histogram.
fcols = 2
frows = 1
plt.figure(figsize=(8, 4))

ax = plt.subplot(1, 2, 1)
sns.regplot(x='V0', y='target', data=train_data, ax=ax,
            scatter_kws={'marker': '.', 's': 3, 'alpha': 0.3},
            line_kws={'color': 'k'})
plt.xlabel('V0')
plt.ylabel('target')

ax = plt.subplot(1, 2, 2)
sns.distplot(train_data['V0'].dropna())  # NOTE: distplot is deprecated in seaborn >= 0.11
plt.xlabel('V0')
plt.show()
 
 
 
查看所有特征变量与’target’变量的线性回归关系
# Regression plot + histogram for every feature against target,
# drawn as (regplot, histogram) pairs, three pairs per row.
fcols = 6
frows = len(test_data.columns)
plt.figure(figsize=(5 * fcols, 4 * frows))

i = 0
for col in test_data.columns:
    i += 1
    ax = plt.subplot(frows, fcols, i)
    sns.regplot(x=col, y='target', data=train_data, ax=ax,
                scatter_kws={'marker': '.', 's': 3, 'alpha': 0.3},
                line_kws={'color': 'k'})
    plt.xlabel(col)
    plt.ylabel('target')

    i += 1
    ax = plt.subplot(frows, fcols, i)
    sns.distplot(train_data[col].dropna())
    plt.xlabel(col)
 
 
 
  查看特征变量的相关性 
# Pearson correlation matrix over the remaining features + target,
# after removing the six distribution-shifted columns.
data_train1 = train_data.drop(['V5', 'V9', 'V11', 'V17', 'V22', 'V28'], axis=1)
train_corr = data_train1.corr()
train_corr
 
  
    
       
      V0 
      V1 
      V2 
      V3 
      V4 
      V6 
      V7 
      V8 
      V10 
      V12 
      ... 
      V29 
      V30 
      V31 
      V32 
      V33 
      V34 
      V35 
      V36 
      V37 
      target 
     
   
  
    
      V0 
      1.000000 
      0.908607 
      0.463643 
      0.409576 
      0.781212 
      0.189267 
      0.141294 
      0.794013 
      0.298443 
      0.751830 
      ... 
      0.302145 
      0.156968 
      0.675003 
      0.050951 
      0.056439 
      -0.019342 
      0.138933 
      0.231417 
      -0.494076 
      0.873212 
     
    
      V1 
      0.908607 
      1.000000 
      0.506514 
      0.383924 
      0.657790 
      0.276805 
      0.205023 
      0.874650 
      0.310120 
      0.656186 
      ... 
      0.147096 
      0.175997 
      0.769745 
      0.085604 
      0.035129 
      -0.029115 
      0.146329 
      0.235299 
      -0.494043 
      0.871846 
     
    
      V2 
      0.463643 
      0.506514 
      1.000000 
      0.410148 
      0.057697 
      0.615938 
      0.477114 
      0.703431 
      0.346006 
      0.059941 
      ... 
      -0.275764 
      0.175943 
      0.653764 
      0.033942 
      0.050309 
      -0.025620 
      0.043648 
      0.316462 
      -0.734956 
      0.638878 
     
    
      V3 
      0.409576 
      0.383924 
      0.410148 
      1.000000 
      0.315046 
      0.233896 
      0.197836 
      0.411946 
      0.321262 
      0.306397 
      ... 
      0.117610 
      0.043966 
      0.421954 
      -0.092423 
      -0.007159 
      -0.031898 
      0.080034 
      0.324475 
      -0.229613 
      0.512074 
     
    
      V4 
      0.781212 
      0.657790 
      0.057697 
      0.315046 
      1.000000 
      -0.117529 
      -0.052370 
      0.449542 
      0.141129 
      0.927685 
      ... 
      0.659093 
      0.022807 
      0.447016 
      -0.026186 
      0.062367 
      0.028659 
      0.100010 
      0.113609 
      -0.031054 
      0.603984 
     
    
      V6 
      0.189267 
      0.276805 
      0.615938 
      0.233896 
      -0.117529 
      1.000000 
      0.917502 
      0.468233 
      0.415660 
      -0.087312 
      ... 
      -0.467980 
      0.188907 
      0.546535 
      0.144550 
      0.054210 
      -0.002914 
      0.044992 
      0.433804 
      -0.404817 
      0.370037 
     
    
      V7 
      0.141294 
      0.205023 
      0.477114 
      0.197836 
      -0.052370 
      0.917502 
      1.000000 
      0.389987 
      0.310982 
      -0.036791 
      ... 
      -0.311363 
      0.170113 
      0.475254 
      0.122707 
      0.034508 
      -0.019103 
      0.111166 
      0.340479 
      -0.292285 
      0.287815 
     
    
      V8 
      0.794013 
      0.874650 
      0.703431 
      0.411946 
      0.449542 
      0.468233 
      0.389987 
      1.000000 
      0.419703 
      0.420557 
      ... 
      -0.011091 
      0.150258 
      0.878072 
      0.038430 
      0.026843 
      -0.036297 
      0.179167 
      0.326586 
      -0.553121 
      0.831904 
     
    
      V10 
      0.298443 
      0.310120 
      0.346006 
      0.321262 
      0.141129 
      0.415660 
      0.310982 
      0.419703 
      1.000000 
      0.140462 
      ... 
      -0.105042 
      -0.036705 
      0.560213 
      -0.093213 
      0.016739 
      -0.026994 
      0.026846 
      0.922190 
      -0.045851 
      0.394767 
     
    
      V12 
      0.751830 
      0.656186 
      0.059941 
      0.306397 
      0.927685 
      -0.087312 
      -0.036791 
      0.420557 
      0.140462 
      1.000000 
      ... 
      0.666775 
      0.028866 
      0.441963 
      -0.007658 
      0.046674 
      0.010122 
      0.081963 
      0.112150 
      -0.054827 
      0.594189 
     
    
      V13 
      0.185144 
      0.157518 
      0.204762 
      -0.003636 
      0.075993 
      0.138367 
      0.110973 
      0.153299 
      -0.059553 
      0.098771 
      ... 
      0.008235 
      0.027328 
      0.113743 
      0.130598 
      0.157513 
      0.116944 
      0.219906 
      -0.024751 
      -0.379714 
      0.203373 
     
    
      V14 
      -0.004144 
      -0.006268 
      -0.106282 
      -0.232677 
      0.023853 
      0.072911 
      0.163931 
      0.008138 
      -0.077543 
      0.020069 
      ... 
      0.056814 
      -0.004057 
      0.010989 
      0.106581 
      0.073535 
      0.043218 
      0.233523 
      -0.086217 
      0.010553 
      0.008424 
     
    
      V15 
      0.314520 
      0.164702 
      -0.224573 
      0.143457 
      0.615704 
      -0.431542 
      -0.291272 
      0.018366 
      -0.046737 
      0.642081 
      ... 
      0.951314 
      -0.111311 
      0.011768 
      -0.104618 
      0.050254 
      0.048602 
      0.100817 
      -0.051861 
      0.245635 
      0.154020 
     
    
      V16 
      0.347357 
      0.435606 
      0.782474 
      0.394517 
      0.023818 
      0.847119 
      0.752683 
      0.680031 
      0.546975 
      0.025736 
      ... 
      -0.342210 
      0.154794 
      0.778538 
      0.041474 
      0.028878 
      -0.054775 
      0.082293 
      0.551880 
      -0.420053 
      0.536748 
     
    
      V18 
      0.148622 
      0.123862 
      0.132105 
      0.022868 
      0.136022 
      0.110570 
      0.098691 
      0.093682 
      -0.024693 
      0.119833 
      ... 
      0.053958 
      0.470341 
      0.079718 
      0.411967 
      0.512139 
      0.365410 
      0.152088 
      0.019603 
      -0.181937 
      0.170721 
     
    
      V19 
      -0.100294 
      -0.092673 
      -0.161802 
      -0.246008 
      -0.205729 
      0.215290 
      0.158371 
      -0.144693 
      0.074903 
      -0.148319 
      ... 
      -0.205409 
      0.100133 
      -0.131542 
      0.144018 
      -0.021517 
      -0.079753 
      -0.220737 
      0.087605 
      0.012115 
      -0.114976 
     
    
      V20 
      0.462493 
      0.459795 
      0.298385 
      0.289594 
      0.291309 
      0.136091 
      0.089399 
      0.412868 
      0.207612 
      0.271559 
      ... 
      0.016233 
      0.086165 
      0.326863 
      0.050699 
      0.009358 
      -0.000979 
      0.048981 
      0.161315 
      -0.322006 
      0.444965 
     
    
      V21 
      -0.029285 
      -0.012911 
      -0.030932 
      0.114373 
      0.174025 
      -0.051806 
      -0.065300 
      -0.047839 
      0.082288 
      0.144371 
      ... 
      0.157097 
      -0.077945 
      0.053025 
      -0.159128 
      -0.087561 
      -0.053707 
      -0.199398 
      0.047340 
      0.315470 
      -0.010063 
     
    
      V23 
      0.231136 
      0.222574 
      0.065509 
      0.081374 
      0.196530 
      0.069901 
      0.125180 
      0.174124 
      -0.066537 
      0.180049 
      ... 
      0.116122 
      0.363963 
      0.129783 
      0.367086 
      0.183666 
      0.196681 
      0.635252 
      -0.035949 
      -0.187582 
      0.226331 
     
    
      V24 
      -0.324959 
      -0.233556 
      0.010225 
      -0.237326 
      -0.529866 
      0.072418 
      -0.030292 
      -0.136898 
      -0.029420 
      -0.550881 
      ... 
      -0.642370 
      0.033532 
      -0.202097 
      0.060608 
      -0.134320 
      -0.095588 
      -0.243738 
      -0.041325 
      -0.137614 
      -0.264815 
     
    
      V25 
      -0.200706 
      -0.070627 
      0.481785 
      -0.100569 
      -0.444375 
      0.438610 
      0.316744 
      0.173320 
      0.079805 
      -0.448877 
      ... 
      -0.575154 
      0.088238 
      0.201243 
      0.065501 
      -0.013312 
      -0.030747 
      -0.093948 
      0.069302 
      -0.246742 
      -0.019373 
     
    
      V26 
      -0.125140 
      -0.043012 
      0.035370 
      -0.027685 
      -0.080487 
      0.106055 
      0.160566 
      0.015724 
      0.072366 
      -0.124111 
      ... 
      -0.133694 
      -0.057247 
      0.062879 
      -0.004545 
      -0.034596 
      0.051294 
      0.085576 
      0.064963 
      0.010880 
      -0.046724 
     
    
      V27 
      0.733198 
      0.824198 
      0.726250 
      0.392006 
      0.412083 
      0.474441 
      0.424185 
      0.901100 
      0.246085 
      0.374380 
      ... 
      -0.032772 
      0.208074 
      0.790239 
      0.095127 
      0.030135 
      -0.036123 
      0.159884 
      0.226713 
      -0.617771 
      0.812585 
     
    
      V29 
      0.302145 
      0.147096 
      -0.275764 
      0.117610 
      0.659093 
      -0.467980 
      -0.311363 
      -0.011091 
      -0.105042 
      0.666775 
      ... 
      1.000000 
      -0.122817 
      -0.004364 
      -0.110699 
      0.035272 
      0.035392 
      0.078588 
      -0.099309 
      0.285581 
      0.123329 
     
    
      V30 
      0.156968 
      0.175997 
      0.175943 
      0.043966 
      0.022807 
      0.188907 
      0.170113 
      0.150258 
      -0.036705 
      0.028866 
      ... 
      -0.122817 
      1.000000 
      0.114318 
      0.695725 
      0.083693 
      -0.028573 
      -0.027987 
      0.006961 
      -0.256814 
      0.187311 
     
    
      V31 
      0.675003 
      0.769745 
      0.653764 
      0.421954 
      0.447016 
      0.546535 
      0.475254 
      0.878072 
      0.560213 
      0.441963 
      ... 
      -0.004364 
      0.114318 
      1.000000 
      0.016782 
      0.016733 
      -0.047273 
      0.152314 
      0.510851 
      -0.357785 
      0.750297 
     
    
      V32 
      0.050951 
      0.085604 
      0.033942 
      -0.092423 
      -0.026186 
      0.144550 
      0.122707 
      0.038430 
      -0.093213 
      -0.007658 
      ... 
      -0.110699 
      0.695725 
      0.016782 
      1.000000 
      0.105255 
      0.069300 
      0.016901 
      -0.054411 
      -0.162417 
      0.066606 
     
    
      V33 
      0.056439 
      0.035129 
      0.050309 
      -0.007159 
      0.062367 
      0.054210 
      0.034508 
      0.026843 
      0.016739 
      0.046674 
      ... 
      0.035272 
      0.083693 
      0.016733 
      0.105255 
      1.000000 
      0.719126 
      0.167597 
      0.031586 
      -0.062715 
      0.077273 
     
    
      V34 
      -0.019342 
      -0.029115 
      -0.025620 
      -0.031898 
      0.028659 
      -0.002914 
      -0.019103 
      -0.036297 
      -0.026994 
      0.010122 
      ... 
      0.035392 
      -0.028573 
      -0.047273 
      0.069300 
      0.719126 
      1.000000 
      0.233616 
      -0.019032 
      -0.006854 
      -0.006034 
     
    
      V35 
      0.138933 
      0.146329 
      0.043648 
      0.080034 
      0.100010 
      0.044992 
      0.111166 
      0.179167 
      0.026846 
      0.081963 
      ... 
      0.078588 
      -0.027987 
      0.152314 
      0.016901 
      0.167597 
      0.233616 
      1.000000 
      0.025401 
      -0.077991 
      0.140294 
     
    
      V36 
      0.231417 
      0.235299 
      0.316462 
      0.324475 
      0.113609 
      0.433804 
      0.340479 
      0.326586 
      0.922190 
      0.112150 
      ... 
      -0.099309 
      0.006961 
      0.510851 
      -0.054411 
      0.031586 
      -0.019032 
      0.025401 
      1.000000 
      -0.039478 
      0.319309 
     
    
      V37 
      -0.494076 
      -0.494043 
      -0.734956 
      -0.229613 
      -0.031054 
      -0.404817 
      -0.292285 
      -0.553121 
      -0.045851 
      -0.054827 
      ... 
      0.285581 
      -0.256814 
      -0.357785 
      -0.162417 
      -0.062715 
      -0.006854 
      -0.077991 
      -0.039478 
      1.000000 
      -0.565795 
     
    
      target 
      0.873212 
      0.871846 
      0.638878 
      0.512074 
      0.603984 
      0.370037 
      0.287815 
      0.831904 
      0.394767 
      0.594189 
      ... 
      0.123329 
      0.187311 
      0.750297 
      0.066606 
      0.077273 
      -0.006034 
      0.140294 
      0.319309 
      -0.565795 
      1.000000 
     
   
33 rows × 33 columns
 
# Heatmap of the Pearson correlation matrix.
# plt.subplots returns a (Figure, Axes) tuple — the original bound the whole
# tuple to `ax` and never used it; unpack it and hand the Axes to seaborn.
fig, ax = plt.subplots(figsize=(20, 16))
ax = sns.heatmap(train_corr, vmax=.8, square=True, annot=True, ax=ax)
 
 
 
# Lower-triangle Spearman correlation heatmap of the kept features.
data_train1 = train_data.drop(['V5', 'V9', 'V11', 'V17', 'V22', 'V28'], axis=1)

plt.figure(figsize=(20, 16))
colnm = data_train1.columns.tolist()
mcorr = data_train1[colnm].corr(method="spearman")

# Mask out the upper triangle so each pair is shown only once.
# np.bool was deprecated in NumPy 1.20 and removed in 1.24 — use builtin bool.
mask = np.zeros_like(mcorr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

cmap = sns.diverging_palette(220, 10, as_cmap=True)
g = sns.heatmap(mcorr, mask=mask, cmap=cmap, square=True, annot=True, fmt='0.2f')
plt.show()
 
 
 
上图为所有特征变量和target变量两两之间的相关系数,由此可以看出各个特征变量V0-V37之间的相关性以及特征变量V0-V37与target的相关性。
  查找出特征变量和target变量相关系数大于0.5的特征变量 
# Heatmap of the k features most correlated with target.
k = 10
cols = train_corr.nlargest(k, 'target')['target'].index

# The original also computed `cm = np.corrcoef(train_data[cols].values.T)`
# but never used it; the heatmap below recomputes the correlations itself,
# so that dead code is removed. plt.subplots' (fig, ax) tuple is unpacked
# properly and the Axes passed to seaborn.
fig, ax = plt.subplots(figsize=(10, 10))
hm = sns.heatmap(train_data[cols].corr(), annot=True, square=True, ax=ax)
plt.show()
 
 
 
# Heatmap restricted to the features whose absolute correlation with
# target exceeds the threshold.
threshold = 0.5

corrmat = train_data.corr()
top_corr_features = corrmat.index[abs(corrmat["target"]) > threshold]

plt.figure(figsize=(10, 10))
g = sns.heatmap(train_data[top_corr_features].corr(), annot=True, cmap="RdYlGn")
 
 
 
# Re-declare the columns to drop. Rebinding the name replaces the old list
# outright, so the original `drop_columns.clear()` call did nothing useful
# and has been removed.
drop_columns = ['V5', 'V9', 'V11', 'V17', 'V22', 'V28']
 
# Columns whose absolute correlation with target falls below the threshold —
# considered uninformative and candidates for removal.
threshold = 0.5

corr_matrix = data_train1.corr().abs()
drop_col = corr_matrix[corr_matrix["target"] < threshold].index
 
由于’V14’, ‘V21’, ‘V25’, ‘V26’, ‘V32’, ‘V33’, 'V34’特征的相关系数值小于0.5,故认为这些特征与最终的预测target值不相关,删除这些特征变量;
# Stack the train features (without target) on top of the test features,
# then drop the six distribution-shifted columns from the combined frame.
train_x = train_data.drop(['target'], axis=1)

data_all = pd.concat([train_x, test_data])
data_all.drop(drop_columns, axis=1, inplace=True)
data_all.head()
 
  
    
       
      V0 
      V1 
      V2 
      V3 
      V4 
      V6 
      V7 
      V8 
      V10 
      V12 
      ... 
      V27 
      V29 
      V30 
      V31 
      V32 
      V33 
      V34 
      V35 
      V36 
      V37 
     
   
  
    
      0 
      0.566 
      0.016 
      -0.143 
      0.407 
      0.452 
      -1.812 
      -2.360 
      -0.436 
      -0.940 
      -0.073 
      ... 
      0.168 
      0.136 
      0.109 
      -0.615 
      0.327 
      -4.627 
      -4.789 
      -5.101 
      -2.608 
      -3.508 
     
    
      1 
      0.968 
      0.437 
      0.066 
      0.566 
      0.194 
      -1.566 
      -2.360 
      0.332 
      0.188 
      -0.134 
      ... 
      0.338 
      -0.128 
      0.124 
      0.032 
      0.600 
      -0.843 
      0.160 
      0.364 
      -0.335 
      -0.730 
     
    
      2 
      1.013 
      0.568 
      0.235 
      0.370 
      0.112 
      -1.367 
      -2.360 
      0.396 
      0.874 
      -0.072 
      ... 
      0.326 
      -0.009 
      0.361 
      0.277 
      -0.116 
      -0.843 
      0.160 
      0.364 
      0.765 
      -0.589 
     
    
      3 
      0.733 
      0.368 
      0.283 
      0.165 
      0.599 
      -1.200 
      -2.086 
      0.403 
      0.011 
      -0.014 
      ... 
      0.277 
      0.015 
      0.417 
      0.279 
      0.603 
      -0.843 
      -0.065 
      0.364 
      0.333 
      -0.112 
     
    
      4 
      0.684 
      0.638 
      0.260 
      0.209 
      0.337 
      -1.073 
      -2.086 
      0.314 
      -0.251 
      0.199 
      ... 
      0.332 
      0.183 
      1.078 
      0.328 
      0.418 
      -0.843 
      -0.215 
      0.364 
      -0.280 
      -0.028 
     
   
5 rows × 32 columns
 
# Normalise every remaining column of data_all into [0, 1].
cols_numeric = list(data_all.columns)


def scale_minmax(col):
    """Linearly rescale a column so its minimum maps to 0 and maximum to 1."""
    lo = col.min()
    hi = col.max()
    return (col - lo) / (hi - lo)


data_all[cols_numeric] = data_all[cols_numeric].apply(scale_minmax, axis=0)
data_all[cols_numeric].describe()
 
  
    
       
      V0 
      V1 
      V2 
      V3 
      V4 
      V6 
      V7 
      V8 
      V10 
      V12 
      ... 
      V27 
      V29 
      V30 
      V31 
      V32 
      V33 
      V34 
      V35 
      V36 
      V37 
     
   
  
    
      count 
      4813.000000 
      4813.000000 
      4813.000000 
      4813.000000 
      4813.000000 
      4813.000000 
      4813.000000 
      4813.000000 
      4813.000000 
      4813.000000 
      ... 
      4813.000000 
      4813.000000 
      4813.000000 
      4813.000000 
      4813.000000 
      4813.000000 
      4813.000000 
      4813.000000 
      4813.000000 
      4813.000000 
     
    
      mean 
      0.694172 
      0.721357 
      0.602300 
      0.603139 
      0.523743 
      0.748823 
      0.745740 
      0.715607 
      0.348518 
      0.578507 
      ... 
      0.881401 
      0.388683 
      0.589459 
      0.792709 
      0.628824 
      0.458493 
      0.483790 
      0.762873 
      0.332385 
      0.545795 
     
    
      std 
      0.144198 
      0.131443 
      0.140628 
      0.152462 
      0.106430 
      0.132560 
      0.132577 
      0.118105 
      0.134882 
      0.105088 
      ... 
      0.128221 
      0.133475 
      0.130786 
      0.102976 
      0.155003 
      0.099095 
      0.101020 
      0.102037 
      0.127456 
      0.150356 
     
    
      min 
      0.000000 
      0.000000 
      0.000000 
      0.000000 
      0.000000 
      0.000000 
      0.000000 
      0.000000 
      0.000000 
      0.000000 
      ... 
      0.000000 
      0.000000 
      0.000000 
      0.000000 
      0.000000 
      0.000000 
      0.000000 
      0.000000 
      0.000000 
      0.000000 
     
    
      25% 
      0.626676 
      0.679416 
      0.514414 
      0.503888 
      0.478182 
      0.683324 
      0.696938 
      0.664934 
      0.284327 
      0.532892 
      ... 
      0.888575 
      0.292445 
      0.550092 
      0.761816 
      0.562461 
      0.409037 
      0.454490 
      0.727273 
      0.270584 
      0.445647 
     
    
      50% 
      0.729488 
      0.752497 
      0.617072 
      0.614270 
      0.535866 
      0.774125 
      0.771974 
      0.742884 
      0.366469 
      0.591635 
      ... 
      0.916015 
      0.375734 
      0.594428 
      0.815055 
      0.643056 
      0.454518 
      0.499949 
      0.800020 
      0.347056 
      0.539317 
     
    
      75% 
      0.790195 
      0.799553 
      0.700464 
      0.710474 
      0.585036 
      0.842259 
      0.836405 
      0.790835 
      0.432965 
      0.641971 
      ... 
      0.932555 
      0.471837 
      0.650798 
      0.852229 
      0.719777 
      0.500000 
      0.511365 
      0.800020 
      0.414861 
      0.643061 
     
    
      max 
      1.000000 
      1.000000 
      1.000000 
      1.000000 
      1.000000 
      1.000000 
      1.000000 
      1.000000 
      1.000000 
      1.000000 
      ... 
      1.000000 
      1.000000 
      1.000000 
      1.000000 
      1.000000 
      1.000000 
      1.000000 
      1.000000 
      1.000000 
      1.000000 
     
   
8 rows × 32 columns
 
# Build min-max-scaled copies of the train and test sets.
# NOTE(review): each set is scaled with its *own* min/max here, unlike
# data_all above which was scaled jointly — confirm this asymmetry is
# intentional before using these frames for modelling.
train_data_process = train_data[cols_numeric].apply(scale_minmax, axis=0)
test_data_process = test_data[cols_numeric].apply(scale_minmax, axis=0)
 
# Split the feature names into two halves so the diagnostic plots below
# can be rendered in two manageable figures.
cols_numeric_left = cols_numeric[:13]
cols_numeric_right = cols_numeric[13:]
 
# For each feature in the left half, draw six panels: the original
# distribution, its Q-Q plot (with skewness), and its scatter vs. target,
# then the same three panels after a Box-Cox transform (+1 shift keeps all
# inputs strictly positive, as boxcox requires).
train_data_process = pd.concat([train_data_process, train_data['target']], axis=1)

fcols = 6
frows = len(cols_numeric_left)
plt.figure(figsize=(4 * fcols, 4 * frows))

i = 0
for var in cols_numeric_left:
    dat = train_data_process[[var, 'target']].dropna()

    # --- original variable ---
    i += 1
    plt.subplot(frows, fcols, i)
    sns.distplot(dat[var], fit=stats.norm)
    plt.title(var + ' Original')
    plt.xlabel('')

    i += 1
    plt.subplot(frows, fcols, i)
    _ = stats.probplot(dat[var], plot=plt)
    plt.title('skew=' + '{:.4f}'.format(stats.skew(dat[var])))
    plt.xlabel('')
    plt.ylabel('')

    i += 1
    plt.subplot(frows, fcols, i)
    plt.plot(dat[var], dat['target'], '.', alpha=0.5)
    plt.title('corr=' + '{:.2f}'.format(np.corrcoef(dat[var], dat['target'])[0][1]))

    # --- Box-Cox transformed variable ---
    i += 1
    plt.subplot(frows, fcols, i)
    # `dat` was already dropna()'d above, so the original's redundant
    # per-column dropna() is removed.
    trans_var, lambda_var = stats.boxcox(dat[var] + 1)
    trans_var = scale_minmax(trans_var)
    sns.distplot(trans_var, fit=stats.norm)
    plt.title(var + ' Transformed')  # fixed title typo: was 'Tramsformed'
    plt.xlabel('')

    i += 1
    plt.subplot(frows, fcols, i)
    _ = stats.probplot(trans_var, plot=plt)
    plt.title('skew=' + '{:.4f}'.format(stats.skew(trans_var)))
    plt.xlabel('')
    plt.ylabel('')

    i += 1
    plt.subplot(frows, fcols, i)
    plt.plot(trans_var, dat['target'], '.', alpha=0.5)
    plt.title('corr=' + '{:.2f}'.format(np.corrcoef(trans_var, dat['target'])[0][1]))
 
 
 
# Same six-panel diagnostics as the previous cell, for the right half of
# the feature list: original distribution / Q-Q / target scatter, then the
# Box-Cox-transformed versions (+1 shift keeps inputs strictly positive).
fcols = 6
frows = len(cols_numeric_right)
plt.figure(figsize=(4 * fcols, 4 * frows))

i = 0
for var in cols_numeric_right:
    dat = train_data_process[[var, 'target']].dropna()

    # --- original variable ---
    i += 1
    plt.subplot(frows, fcols, i)
    sns.distplot(dat[var], fit=stats.norm)
    plt.title(var + ' Original')
    plt.xlabel('')

    i += 1
    plt.subplot(frows, fcols, i)
    _ = stats.probplot(dat[var], plot=plt)
    plt.title('skew=' + '{:.4f}'.format(stats.skew(dat[var])))
    plt.xlabel('')
    plt.ylabel('')

    i += 1
    plt.subplot(frows, fcols, i)
    plt.plot(dat[var], dat['target'], '.', alpha=0.5)
    plt.title('corr=' + '{:.2f}'.format(np.corrcoef(dat[var], dat['target'])[0][1]))

    # --- Box-Cox transformed variable ---
    i += 1
    plt.subplot(frows, fcols, i)
    # `dat` was already dropna()'d above, so the original's redundant
    # per-column dropna() is removed.
    trans_var, lambda_var = stats.boxcox(dat[var] + 1)
    trans_var = scale_minmax(trans_var)
    sns.distplot(trans_var, fit=stats.norm)
    plt.title(var + ' Transformed')  # fixed title typo: was 'Tramsformed'
    plt.xlabel('')

    i += 1
    plt.subplot(frows, fcols, i)
    _ = stats.probplot(trans_var, plot=plt)
    plt.title('skew=' + '{:.4f}'.format(stats.skew(trans_var)))
    plt.xlabel('')
    plt.ylabel('')

    i += 1
    plt.subplot(frows, fcols, i)
    plt.plot(trans_var, dat['target'], '.', alpha=0.5)
    plt.title('corr=' + '{:.2f}'.format(np.corrcoef(trans_var, dat['target'])[0][1]))