# 1. Data preprocessing: loading tabular data with pandas.
# The paths below ('file_name.csv', 'path') are placeholders; substitute real files.
import pandas as pd

# Simplest form: comma-separated file whose first row is the header.
df = pd.read_csv('file_name.csv')

# Excel sheet: header=0 treats the first row as the header, and `names`
# replaces it with custom column labels. The keyword is `sheet_name`
# (the old `sheetname` spelling was removed from pandas).
df = pd.read_excel('path', sheet_name='sheet1', header=0,
                   names=['第一列', '第二列', '第三列'])

# CSV with explicit separator, header row, replacement column names and encoding.
data = pd.read_csv('path', sep=',', header=0,
                   names=['第一列', '第二列', '第三列'], encoding='utf-8')

# Tab-separated file without a header row: header=None keeps the first line as
# data and `names` supplies the column labels.
data = pd.read_table('path', sep='\t', header=None,
                     names=['第一列', '第二列', '第三列'])
# Data cleaning: remove duplicate rows and missing values.
# Each statement is an independent option operating on an existing DataFrame `df`.

# Drop rows that are exact duplicates of an earlier row.
df = df.drop_duplicates()

# Drop every row that contains at least one missing value.
df = df.dropna()

# Drop every column that contains at least one missing value.
df = df.dropna(axis=1)
# Filling missing values: each statement is an alternative strategy applied to
# an existing DataFrame `df`.

# Column mean. numeric_only=True skips non-numeric columns, which would
# otherwise raise a TypeError on pandas >= 2.0.
df = df.fillna(df.mean(numeric_only=True))

# Column median (same numeric_only reasoning as above).
df = df.fillna(df.median(numeric_only=True))

# Most frequent value per column; mode() can return several rows on ties,
# so take the first one.
df = df.fillna(df.mode().iloc[0])

# Forward fill: propagate the previous valid value downwards.
# (fillna(method='ffill') is deprecated in favour of ffill().)
df = df.ffill()

# Backward fill: propagate the next valid value upwards.
# (fillna(method='bfill') is deprecated in favour of bfill().)
df = df.bfill()
数据转换: 归一化&标准化: 归一化把特征的数值范围缩放到 [0,1];标准化把特征变换为均值 0、标准差 1(只做平移和缩放,并不会把数据变成正态分布)
# Min-max normalization and standardization of one numeric column with
# scikit-learn. Relies on `pd` (pandas) being imported earlier in the file.
from sklearn.preprocessing import MinMaxScaler, StandardScaler

data = {'ages': [20, 30, 40, 50]}
df = pd.DataFrame(data)

min_max_scaler = MinMaxScaler()
standard_scaler = StandardScaler()
# Backward-compatible alias for the original (misspelled) variable name.
standard_sacler = standard_scaler

# Scalers require 2-D input, hence df[['ages']] (a one-column DataFrame).
# fit_transform returns an (n, 1) array; ravel() flattens it to one column.
df['ages_normalized'] = min_max_scaler.fit_transform(df[['ages']]).ravel()
df['ages_standardized'] = standard_scaler.fit_transform(df[['ages']]).ravel()
独热编码: 分类变量→数字
# One-hot encoding: turn a categorical column into one indicator column per
# category. Relies on `pd` (pandas) being imported earlier in the file.
data = {'颜色': ['红色', '绿色', '蓝色', '红色', '绿色', '蓝色']}
df = pd.DataFrame(data)

# The original column is replaced by columns named '<column>_<category>'.
# dtype=int makes the indicators 0/1 integers; recent pandas versions would
# otherwise produce booleans.
df = pd.get_dummies(df, columns=['颜色'], dtype=int)
# Reading a directory tree: write the path of every ".WAV" file found under
# base_path into test.scp, one path per line, echoing each path to stdout.
import os
import numpy as np

base_path = "TIMI/TEST/"

with open("test.scp", 'wt', encoding='utf-8') as f:
    # os.walk visits base_path and all of its subdirectories.
    for root, dirs, files in os.walk(base_path):
        for file in files:
            file_name = os.path.join(root, file)
            # The suffix match is case-sensitive: only upper-case ".WAV" is listed.
            if file_name.endswith(".WAV"):
                print(file_name)
                f.write("%s\n" % file_name)