特征工程与数据预处理全解析:基础技术和代码示例
新机器视觉
共 12090字,需浏览 25分钟
·
2024-07-06 21:56
转载自:Imagination Tech
def outlier_thresholds(dataframe, col_name, q1=0.25, q3=0.75):
    """Return (low_limit, up_limit) outlier fences for *col_name* via the IQR rule.

    Fences are placed 1.5 * IQR beyond the q1 / q3 quantiles (Tukey's rule).
    """
    quartile1 = dataframe[col_name].quantile(q1)
    quartile3 = dataframe[col_name].quantile(q3)
    interquantile_range = quartile3 - quartile1
    up_limit = quartile3 + 1.5 * interquantile_range
    low_limit = quartile1 - 1.5 * interquantile_range
    return low_limit, up_limit


def check_outlier(dataframe, col_name):
    """Return True if *col_name* holds at least one value outside its IQR fences."""
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    outside = (dataframe[col_name] > up_limit) | (dataframe[col_name] < low_limit)
    # bool(...) keeps the original's plain-Python True/False return.
    return bool(dataframe[outside].any(axis=None))
def remove_outlier(dataframe, col_name):
    """Return a copy of *dataframe* without rows whose *col_name* lies outside the IQR fences."""
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    is_outlier = (dataframe[col_name] < low_limit) | (dataframe[col_name] > up_limit)
    return dataframe[~is_outlier]
1.2.2 带阈值的重新分配
def replace_with_thresholds(dataframe, variable):
    """Winsorize *variable* in place: clip values beyond the IQR fences to the nearest fence."""
    low_limit, up_limit = outlier_thresholds(dataframe, variable)
    below = dataframe[variable] < low_limit
    above = dataframe[variable] > up_limit
    dataframe.loc[below, variable] = low_limit
    dataframe.loc[above, variable] = up_limit
from sklearn.neighbors import LocalOutlierFactor


def detect_outliers_lof(data, n_neighbors=20):
    """Flag outliers with Local Outlier Factor.

    Returns a boolean array: True for outliers, False for inliers.
    """
    lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination='auto')
    labels = lof.fit_predict(data)
    # fit_predict marks outliers with -1 and inliers with 1.
    return labels == -1
缺失值 缺失值是现实世界数据集中常见的问题,处理丢失数据时要考虑的一个重要问题是丢失数据的随机性。
def missing_values_table(dataframe, na_name=False):
    """Print a count/ratio table of missing values per column.

    When *na_name* is True, also return the list of columns containing NAs.
    """
    na_columns = [c for c in dataframe.columns if dataframe[c].isnull().sum() > 0]
    n_miss = dataframe[na_columns].isnull().sum().sort_values(ascending=False)
    ratio = (dataframe[na_columns].isnull().sum() / dataframe.shape[0] * 100)
    ratio = ratio.sort_values(ascending=False)
    missing_df = pd.concat([n_miss, np.round(ratio, 2)], axis=1, keys=['n_miss', 'ratio'])
    print(missing_df, end="\n")
    if na_name:
        return na_columns
2.1 缺失值处理
def remove_missing(df, threshold=0.7):
    """Drop columns with fewer than *threshold* fraction of non-null values, then drop NA rows."""
    min_non_null = int(threshold * len(df))
    trimmed = df.dropna(thresh=min_non_null, axis=1)
    return trimmed.dropna()
def simple_impute(dataframe, cat_th=10, car_th=20):
    """Impute missing values in place: median for numeric, mode for categorical columns.

    Columns are classed as categorical when object-typed or low-cardinality numeric
    (fewer than *cat_th* unique values); object columns with more than *car_th*
    unique values are treated as high-cardinality and left untouched.

    Bug fixes vs. the original: *cat_th*/*car_th* were undefined globals (now
    parameters with defaults), the function filled and returned an undefined
    global ``df`` instead of its own parameter, and ``mode().iloc[0]`` crashed
    when there were no categorical columns.
    """
    cat_cols = [col for col in dataframe.columns if dataframe[col].dtypes == "O"]
    num_but_cat = [col for col in dataframe.columns
                   if dataframe[col].nunique() < cat_th and dataframe[col].dtypes != "O"]
    cat_but_car = [col for col in dataframe.columns
                   if dataframe[col].nunique() > car_th and dataframe[col].dtypes == "O"]
    cat_cols = [col for col in cat_cols + num_but_cat if col not in cat_but_car]
    num_cols = [col for col in dataframe.columns
                if dataframe[col].dtypes != "O" and col not in num_but_cat]
    if num_cols:
        dataframe[num_cols] = dataframe[num_cols].fillna(dataframe[num_cols].median())
    if cat_cols:
        # mode() can return several rows on ties; take the first, per column.
        dataframe[cat_cols] = dataframe[cat_cols].fillna(dataframe[cat_cols].mode().iloc[0])
    return dataframe
def categorical_impute(df, col_1, col_2, method="mean"):
    """Fill NAs in *col_1* with the per-group statistic *method* computed within *col_2* groups.

    Bug fix: the original called ``fillna`` without assigning the result, so the
    frame was returned unchanged (``fillna`` is not in-place by default).
    """
    df[col_1] = df[col_1].fillna(df.groupby(col_2)[col_1].transform(method))
    return df
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler


def knn_impute(dataframe, n_neighbors=5, cat_th=10, car_th=20):
    """Impute missing values with KNN on a one-hot-encoded, min-max-scaled copy.

    Returns a new DataFrame (dummy-encoded columns, scaled to [0, 1]) with
    missing values imputed; the input frame is not modified.

    Bug fixes vs. the original: *cat_th*/*car_th* were undefined globals (now
    parameters with defaults), ``df``/``dataframe`` naming was inconsistent,
    MinMaxScaler was used without a visible import, and a dead ``df.head()``
    call is removed.
    """
    cat_cols = [col for col in dataframe.columns if dataframe[col].dtypes == "O"]
    num_but_cat = [col for col in dataframe.columns
                   if dataframe[col].nunique() < cat_th and dataframe[col].dtypes != "O"]
    cat_but_car = [col for col in dataframe.columns
                   if dataframe[col].nunique() > car_th and dataframe[col].dtypes == "O"]
    cat_cols = [col for col in cat_cols + num_but_cat if col not in cat_but_car]
    num_cols = [col for col in dataframe.columns
                if dataframe[col].dtypes != "O" and col not in num_but_cat]
    encoded = pd.get_dummies(dataframe[cat_cols + num_cols], drop_first=True)
    # KNN distances are scale-sensitive, so normalize to [0, 1] first.
    scaler = MinMaxScaler()
    scaled = pd.DataFrame(scaler.fit_transform(encoded), columns=encoded.columns)
    imputer = KNNImputer(n_neighbors=n_neighbors)
    return pd.DataFrame(imputer.fit_transform(scaled), columns=scaled.columns)
编码
from sklearn.preprocessing import LabelEncoder


def label_encoder(dataframe, binary_col):
    """Integer-encode a binary column in place and return the frame."""
    encoder = LabelEncoder()
    dataframe[binary_col] = encoder.fit_transform(dataframe[binary_col])
    return dataframe


# NOTE(review): `df` is assumed to be loaded earlier in the article.
# Encode every non-numeric column with exactly two distinct values.
binary_cols = [col for col in df.columns
               if df[col].dtype not in [int, float] and df[col].nunique() == 2]
for col in binary_cols:
    label_encoder(df, col)
def one_hot_encoder(dataframe, categorical_cols, drop_first=True):
    """Return a copy of *dataframe* with *categorical_cols* one-hot encoded."""
    return pd.get_dummies(dataframe, columns=categorical_cols, drop_first=drop_first)


# NOTE(review): `df` is assumed to be loaded earlier in the article.
# One-hot encode columns with 3 to 10 distinct values.
ohe_cols = [col for col in df.columns if 10 >= df[col].nunique() > 2]
one_hot_encoder(df, ohe_cols).head()
# NOTE(review): `dataframe`, `cat_th` and `car_th` are referenced here but are
# not defined in this excerpt — they are assumed to come from earlier code.
cat_cols = [col for col in dataframe.columns if dataframe[col].dtypes == "O"]
num_but_cat = [col for col in dataframe.columns
               if dataframe[col].nunique() < cat_th and dataframe[col].dtypes != "O"]
cat_but_car = [col for col in dataframe.columns
               if dataframe[col].nunique() > car_th and dataframe[col].dtypes == "O"]
cat_cols = cat_cols + num_but_cat
cat_cols = [col for col in cat_cols if col not in cat_but_car]


def rare_analyser(dataframe, target, cat_cols):
    """Print, per categorical column: level count, level frequencies, and mean target per level."""
    for col in cat_cols:
        print(col, ":", len(dataframe[col].value_counts()))
        summary = pd.DataFrame({"COUNT": dataframe[col].value_counts(),
                                "RATIO": dataframe[col].value_counts() / len(dataframe),
                                "TARGET_MEAN": dataframe.groupby(col)[target].mean()})
        print(summary, end="\n\n\n")


rare_analyser(df, "TARGET", cat_cols)


def rare_encoder(dataframe, rare_perc):
    """Return a copy where categorical levels rarer than *rare_perc* are collapsed into 'Rare'."""
    temp_df = dataframe.copy()
    rare_columns = [col for col in temp_df.columns
                    if temp_df[col].dtypes == 'O'
                    and (temp_df[col].value_counts() / len(temp_df) < rare_perc).any(axis=None)]
    for var in rare_columns:
        freqs = temp_df[var].value_counts() / len(temp_df)
        rare_labels = freqs[freqs < rare_perc].index
        temp_df[var] = np.where(temp_df[var].isin(rare_labels), 'Rare', temp_df[var])
    return temp_df


new_df = rare_encoder(df, 0.01)
from sklearn.preprocessing import StandardScaler


def standard_scale(df, columns):
    """Z-score-normalize the given columns in place and return the frame."""
    df[columns] = StandardScaler().fit_transform(df[columns])
    return df
4.2 Robust Scaling
from sklearn.preprocessing import RobustScaler


def robust_scale(df, columns):
    """Scale the given columns in place with median/IQR (outlier-robust) and return the frame."""
    df[columns] = RobustScaler().fit_transform(df[columns])
    return df
4.3 Min-Max Scaling
from sklearn.preprocessing import MinMaxScaler


def minmax_scale(df, columns):
    """Rescale the given columns in place to the [0, 1] range and return the frame."""
    df[columns] = MinMaxScaler().fit_transform(df[columns])
    return df
import numpy as np


def binning(df, column, bins, labels=None):
    """Add '<column>_binned': *column* cut into *bins* equal-frequency (quantile) buckets.

    Bug fix: ``pd.qcut`` takes the bucket count as ``q`` — the original passed
    the unsupported keyword ``bins`` (that keyword belongs to ``pd.cut``),
    which raises TypeError at call time.
    """
    df[f'{column}_binned'] = pd.qcut(df[column], q=bins, labels=labels)
    return df
def create_binary_feature(df, column, condition):
    """Add '<column>_flag': 1 where *condition*(series) holds, 0 elsewhere; return the frame."""
    mask = condition(df[column])
    df[f'{column}_flag'] = np.where(mask, 1, 0)
    return df
# NOTE(review): `df` is assumed to be the Titanic frame loaded earlier in the
# article (columns "Name", "Survived", "AGE").
# Letter count of the passenger name.
df["NEW_NAME_COUNT"] = df["Name"].str.len()
# Word count of the passenger name.
df["NEW_NAME_WORD_COUNT"] = df["Name"].apply(lambda name: len(str(name).split(" ")))
# Count name words starting with "Dr" (captures the doctor title).
df["NEW_NAME_DR"] = df["Name"].apply(lambda name: len([w for w in name.split() if w.startswith("Dr")]))
df.groupby("NEW_NAME_DR").agg({"Survived": ["mean", "count"]})
# Extract the honorific title (word followed by a period) via regex.
df['NEW_TITLE'] = df.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
df[["NEW_TITLE", "Survived", "AGE"]].groupby(["NEW_TITLE"]).agg({"Survived": "mean", "AGE": ["count", "mean"]})
def date_features(df, date_column):
    """Expand a datetime column into year / month / day / day-of-week columns; return the frame."""
    accessor = df[date_column].dt
    df[f'{date_column}_year'] = accessor.year
    df[f'{date_column}_month'] = accessor.month
    df[f'{date_column}_day'] = accessor.day
    df[f'{date_column}_dayofweek'] = accessor.dayofweek
    return df
评论