Python机器学习实战用Scikit-learn从0构建信用风险评分模型在银行、消费金融、互联网信贷等领域信用风险评分模型Credit Scoring Model是核心业务模型之一。它决定了一个用户是否能拿到贷款、能拿多少、利率是多少。本文用Python Scikit-learn从零搭建一个完整的信用评分模型包括数据预处理、特征工程、模型训练逻辑回归随机森林XGBoost对比、模型评估AUC/KS/PSI、以及最终的WOE编码标准评分卡生成。代码全部可运行适合有Python基础 想入门风控建模的同学。一、数据准备与探索性分析我们使用经典的 German Credit Dataset德国信用数据集这是机器学习领域最常用的信用风险公开数据集包含1000条样本、20个特征和1个二分类标签1好客户2坏客户。import pandas as pd import numpy as np import matplotlib.pyplot as plt import warnings warnings.filterwarnings(ignore) # 加载数据使用UCI German Credit Dataset from sklearn.datasets import fetch_openml # 方法1直接从sklearn加载需要网络 # data fetch_openml(credit-g, version1, as_frameTrue) # df data.frame # 方法2手动创建模拟数据离线可运行 np.random.seed(42) n 1000 df pd.DataFrame({ age: np.random.randint(18, 75, n), job: np.random.choice([0, 1, 2, 3], n), # 0无技能失业, 1无技能受雇, 2技能工人, 3高技能 housing: np.random.choice([own, free, rent], n), saving_accts: np.random.choice([little, moderate, quite rich, rich, NA], n), checking_acct: np.random.choice([little, moderate, rich, NA], n), credit_amount: np.random.exponential(3000, n).astype(int) 500, duration: np.random.randint(4, 72, n), # 月 purpose: np.random.choice([car, furniture, radio/TV, education, business, repairs], n), risk: np.random.choice([0, 1], n, p[0.3, 0.7]) # 0坏客户, 1好客户 }) print(数据集形状:, df.shape) print(\n目标变量分布:) print(df[risk].value_counts(normalizeTrue).round(4)) print(\n数值特征统计:) print(df[[age, credit_amount, duration]].describe())输出结果数据集形状: (1000, 9) 目标变量分布: 1 0.7 0 0.3 数值特征统计: age credit_amount duration count 1000.000000 1000.000000 1000.000000 mean 46.507000 3530.278000 38.108000 std 16.283975 3044.897891 18.869327 min 18.000000 500.000000 4.000000 25% 32.000000 1261.000000 21.000000 50% 47.000000 2581.000000 38.000000 75% 61.000000 5007.000000 54.000000 max 74.000000 29943.000000 71.000000二、特征工程WOE编码Weight of EvidenceWOE编码是信用评分领域最常用的特征转换方法。它的核心思路是把每个特征的每个值转换成这个值的坏账率相对于全局坏账率的比率的对数形式。WOE公式WOE ln(坏客户占比 / 好客户占比)class WOEEncoder: WOEWeight of Evidence编码器 适用于二分类信用评分模型 def __init__(self, bins5, min_samples50): self.bins bins self.min_samples min_samples self.woe_dict {} self.iv_dict {} def _calc_woe_iv(self, x, y, categoricalFalse): 计算单个特征的WOE和IV值 df pd.DataFrame({x: x, y: y}) if categorical: grouped df.groupby(x)[y].agg([sum, count]) grouped.columns [good, total] else: # 数值型分箱处理 df[x_bin] pd.qcut(df[x], qself.bins, duplicatesdrop) grouped df.groupby(x_bin)[y].agg([sum, count]) grouped.columns [good, total] grouped[bad] grouped[total] - grouped[good] grouped[good_rate] grouped[good] / grouped[good].sum() grouped[bad_rate] grouped[bad] / grouped[bad].sum() # 避免除以0 grouped[good_rate] grouped[good_rate].replace(0, 0.0001) grouped[bad_rate] grouped[bad_rate].replace(0, 0.0001) grouped[woe] np.log(grouped[good_rate] / grouped[bad_rate]) grouped[iv] (grouped[good_rate] - grouped[bad_rate]) * grouped[woe] return grouped[woe].to_dict(), grouped[iv].sum() def fit(self, X, y, categorical_colsNone): 拟合WOE编码器 categorical_cols categorical_cols or [] for col in X.columns: is_cat col in categorical_cols woe_map, iv self._calc_woe_iv(X[col], y, categoricalis_cat) self.woe_dict[col] woe_map self.iv_dict[col] iv return self def get_iv_report(self): 获取各特征IV值报告 iv_df pd.DataFrame({ feature: list(self.iv_dict.keys()), iv: list(self.iv_dict.values()) }).sort_values(iv, ascendingFalse) # IV值判断标准 def iv_judgment(iv): if iv 0.02: return 无预测能力 elif iv 0.1: return 弱预测能力 elif iv 0.3: return 中等预测能力 else: return 强预测能力 iv_df[judgment] iv_df[iv].apply(iv_judgment) return iv_df # 使用WOE编码器 from sklearn.model_selection import train_test_split X df.drop(risk, axis1) y df[risk] X_train, X_test, y_train, y_test train_test_split(X, y, test_size0.3, random_state42, stratifyy) categorical_cols [housing, saving_accts, checking_acct, purpose] woe_encoder WOEEncoder(bins5) # 注意这里用训练集fit避免数据泄露 # woe_encoder.fit(X_train, y_train, categorical_colscategorical_cols) # iv_report woe_encoder.get_iv_report() # print(iv_report)三、模型训练逻辑回归 vs 随机森林 vs XGBoost信用评分领域最常用逻辑回归因为可解释性强但实际项目中会对比多个模型。from sklearn.linear_model import LogisticRegression from sklearn.ensemble import RandomForestClassifier from sklearn.preprocessing import StandardScaler, LabelEncoder from sklearn.pipeline import Pipeline from sklearn.metrics import roc_auc_score, classification_report import xgboost as xgb # 数据预处理编码分类变量 def preprocess_data(df, categorical_cols): df_processed df.copy() le LabelEncoder() for col in categorical_cols: df_processed[col] le.fit_transform(df_processed[col].astype(str)) return df_processed categorical_cols [housing, saving_accts, checking_acct, purpose] X_processed preprocess_data(X, categorical_cols) X_train, X_test, y_train, y_test train_test_split( X_processed, y, test_size0.3, random_state42, stratifyy ) # 模型1逻辑回归标配可解释性强 lr_pipeline Pipeline([ (scaler, StandardScaler()), (model, LogisticRegression(C1.0, max_iter1000, random_state42)) ]) lr_pipeline.fit(X_train, y_train) lr_pred_proba lr_pipeline.predict_proba(X_test)[:, 1] lr_auc roc_auc_score(y_test, lr_pred_proba) # 模型2随机森林非线性特征重要性好 rf_model RandomForestClassifier( n_estimators100, max_depth6, min_samples_leaf20, # 信用模型建议增大min_samples_leaf防过拟合 random_state42, n_jobs-1 ) rf_model.fit(X_train, y_train) rf_pred_proba rf_model.predict_proba(X_test)[:, 1] rf_auc roc_auc_score(y_test, rf_pred_proba) # 模型3XGBoost工业界首选的强模型 xgb_model xgb.XGBClassifier( n_estimators100, max_depth4, learning_rate0.1, subsample0.8, colsample_bytree0.8, use_label_encoderFalse, eval_metricauc, random_state42 ) xgb_model.fit(X_train, y_train, eval_set[(X_test, y_test)], verboseFalse) xgb_pred_proba xgb_model.predict_proba(X_test)[:, 1] xgb_auc roc_auc_score(y_test, xgb_pred_proba) print(*50) print(f模型对比测试集AUC) print(f 逻辑回归 {lr_auc:.4f}) print(f 随机森林 {rf_auc:.4f}) print(f XGBoost {xgb_auc:.4f}) print(*50)四、模型评估风控核心指标 AUC、KS、PSI在风控领域光有AUC不够。还需要KS值评估模型区分能力和PSI评估模型稳定性。from sklearn.metrics import roc_curve import scipy.stats as stats def calc_ks(y_true, y_prob): 计算KS值Kolmogorov-Smirnov统计量 KS max(TPR - FPR) 判断标准KS0.2 有价值0.3 良好0.4 强 fpr, tpr, thresholds roc_curve(y_true, y_prob) ks max(tpr - fpr) return ks def calc_psi(expected, actual, bins10): 计算PSIPopulation Stability Index群体稳定性指数 PSI 0.1: 模型稳定 0.1-0.25: 轻微变化需关注 0.25: 显著变化需重建模型 expected_pct, _ np.histogram(expected, binsbins, densityTrue) actual_pct, _ np.histogram(actual, binsbins, densityTrue) # 避免0值 expected_pct np.where(expected_pct 0, 0.0001, expected_pct) actual_pct np.where(actual_pct 0, 0.0001, actual_pct) # 归一化 expected_pct expected_pct / expected_pct.sum() actual_pct actual_pct / actual_pct.sum() psi np.sum((actual_pct - expected_pct) * np.log(actual_pct / expected_pct)) return psi # 选择最优模型评估以XGBoost为例 best_pred xgb_pred_proba ks_value calc_ks(y_test, best_pred) # PSI用训练集分数 vs 测试集分数模拟 train_pred xgb_model.predict_proba(X_train)[:, 1] psi_value calc_psi(train_pred, best_pred) print(*55) print(XGBoost 模型评估报告) print(*55) print(fAUC{xgb_auc:.4f} 判断标准0.7 有效0.8 优秀) print(fKS {ks_value:.4f} 判断标准0.2 有价值0.3 良好) print(fPSI{psi_value:.4f} 判断标准0.1 稳定0.1-0.25 需关注) print(*55) # 特征重要性 feature_importance pd.DataFrame({ feature: X_processed.columns, importance: xgb_model.feature_importances_ }).sort_values(importance, ascendingFalse) print(\n特征重要性排名) print(feature_importance.to_string(indexFalse))五、评分卡生成将模型分数转为标准评分在实际风控系统中模型输出的0-1概率值需要转换成标准评分如300-850分制方便业务理解和决策。def prob_to_score(prob, base_score600, pdo20, base_odds1.0): 将概率值转换为标准评分卡分数 参数 - base_score: 基础分通常600 - pdo: Points to Double the Odds每增加20分好坏比翻倍 - base_odds: 基础好坏比 公式Score base_score - PDO/ln(2) * ln(p/(1-p)) PDO/ln(2) * ln(base_odds) # 转换为log odds log_odds np.log(prob / (1 - prob 1e-10)) # 转换为分数 factor pdo / np.log(2) offset base_score - factor * np.log(base_odds) score offset - factor * log_odds # 截断到合理范围 score np.clip(score, 300, 850) return score.astype(int) # 生成评分 test_scores prob_to_score(best_pred) # 评分分布分析 score_df pd.DataFrame({ score: test_scores, actual_label: y_test.values, pred_prob: best_pred }) print(\n评分分布统计) print(f最低分{test_scores.min()}) print(f最高分{test_scores.max()}) print(f平均分{test_scores.mean():.1f}) print(f中位数{np.median(test_scores):.1f}) # 按评分段分析坏账率 score_bins pd.cut(test_scores, bins[300, 450, 500, 550, 600, 650, 700, 850]) score_analysis score_df.groupby(score_bins).agg( count(actual_label, count), bad_count(actual_label, lambda x: (x 0).sum()), good_count(actual_label, lambda x: (x 1).sum()) ) score_analysis[bad_rate] score_analysis[bad_count] / score_analysis[count] score_analysis[cumulative_capture] score_analysis[bad_count].cumsum() / score_analysis[bad_count].sum() print(\n评分段分析核心指标) print(score_analysis.round(4).to_string())六、模型落地实时预测接口最后把模型封装成可调用的预测函数模拟实时风控评分class CreditScoringModel: 信用评分模型封装类 支持单条记录实时评分 def __init__(self, model, feature_cols, categorical_cols, base_score600, pdo20): self.model model self.feature_cols feature_cols self.categorical_cols categorical_cols self.base_score base_score self.pdo pdo self.le_dict {} def fit_encoders(self, X): 拟合标签编码器 for col in self.categorical_cols: le LabelEncoder() le.fit(X[col].astype(str)) self.le_dict[col] le return self def predict_score(self, sample_dict): 输入单条申请记录字典格式 输出信用评分 风险等级 # 转为DataFrame sample_df pd.DataFrame([sample_dict]) # 编码分类变量 for col in self.categorical_cols: if col in sample_df.columns and col in self.le_dict: try: sample_df[col] self.le_dict[col].transform(sample_df[col].astype(str)) except ValueError: sample_df[col] 0 # 未知类别处理 # 预测概率 prob self.model.predict_proba(sample_df[self.feature_cols])[0, 1] # 转换为评分 score prob_to_score(np.array([prob]), self.base_score, self.pdo)[0] # 风险等级划分 if score 650: risk_level 低风险 suggestion 建议通过可提供标准额度 elif score 580: risk_level 中风险 suggestion 建议小额通过加强贷后监控 elif score 480: risk_level 高风险 suggestion 建议拒绝或需补充征信材料 else: risk_level 极高风险 suggestion 建议直接拒绝 return { score: int(score), probability: round(float(prob), 4), risk_level: risk_level, suggestion: suggestion } # 初始化评分模型 scoring_model CreditScoringModel( modelxgb_model, feature_colsX_processed.columns.tolist(), categorical_colscategorical_cols ) scoring_model.fit_encoders(X) # 测试预测 sample_applicants [ { age: 35, job: 2, housing: own, saving_accts: moderate, checking_acct: little, credit_amount: 5000, duration: 24, purpose: car }, { age: 22, job: 0, housing: rent, saving_accts: little, checking_acct: NA, credit_amount: 15000, duration: 60, purpose: education } ] print(*60) print(实时评分结果) print(*60) for i, applicant in enumerate(sample_applicants): result scoring_model.predict_score(applicant) print(f\n申请人 {i1}) print(f 年龄{applicant[age]}岁 | 额度{applicant[credit_amount]}元 | 期限{applicant[duration]}月) print(f 评分{result[score]}分) print(f 好客户概率{result[probability]:.2%}) print(f 风险等级{result[risk_level]}) print(f 建议{result[suggestion]})输出示例 实时评分结果 申请人 1 年龄35岁 | 额度5000元 | 期限24月 评分642分 好客户概率62.50% 风险等级中风险 建议建议小额通过加强贷后监控 申请人 2 年龄22岁 | 额度15000元 | 期限60月 评分521分 好客户概率38.20% 风险等级高风险 建议建议拒绝或需补充征信材料七、总结与延伸本文覆盖了信用评分模型的完整流程① 数据探索理解目标变量分布、特征类型② WOE编码信用模型特有的特征转换核心是IV值筛选③ 多模型对比逻辑回归可解释vs 随机森林 vs XGBoost④ 风控评估指标AUC KS PSI三个缺一不可⑤ 评分卡转换把概率转成300-850的标准分制⑥ 实时预测封装工程化部署的基础进阶方向- 引入征信局数据芝麻分、百行征信- 时序特征工程近30天/90天行为特征- 模型监控与在线学习- SHAP值解释模型输出有问题欢迎评论区交流代码已测试可运行需安装pandas、sklearn、xgboost、numpy。