|
| 1 | +# Copyright (c) Microsoft Corporation. |
| 2 | +# Licensed under the MIT License. |
| 3 | +import pandas as pd |
| 4 | +import numpy as np |
| 5 | +from qlib.contrib.report.data.base import FeaAnalyser |
| 6 | +from qlib.contrib.report.utils import sub_fig_generator |
| 7 | +from qlib.utils.paral import datetime_groupby_apply |
| 8 | +from qlib.contrib.eva.alpha import pred_autocorr_all |
| 9 | +from loguru import logger |
| 10 | +import seaborn as sns |
| 11 | + |
| 12 | +DT_COL_NAME = "datetime" |
| 13 | + |
| 14 | + |
class CombFeaAna(FeaAnalyser):
    """
    Combine the sub feature analysers and plot them in a single graph.

    One analyser is instantiated per class passed in ``fea_ana_cls``, all of
    them on the same dataset.  ``plot_all`` then lets every sub analyser draw
    a column on its own axis.
    """

    def __init__(self, dataset: pd.DataFrame, *fea_ana_cls):
        """
        Parameters
        ----------
        dataset : pd.DataFrame
            Data handed to every sub analyser and to the base class.
        *fea_ana_cls
            At least two analyser classes; each one is called as
            ``cls(dataset)``.

        Raises
        ------
        NotImplementedError
            If fewer than two analyser classes are given.
        """
        if len(fea_ana_cls) <= 1:
            raise NotImplementedError("This type of input is not supported")
        self._fea_ana_l = [fcls(dataset) for fcls in fea_ana_cls]
        super().__init__(dataset=dataset)

    def skip(self, col):
        """Return True only when every sub analyser skips ``col``."""
        # A list (not a generator) is built on purpose: every sub analyser's
        # ``skip`` is still called, so any logging it does is not cut short.
        return all([fa.skip(col) for fa in self._fea_ana_l])

    def calc_stat_values(self):
        """The statistics of features are finished in the underlying analysers"""

    def plot_all(self, *args, **kwargs):
        """
        Plot every column that is not skipped, one axis per sub analyser.

        ``*args`` and ``**kwargs`` are forwarded to ``sub_fig_generator``;
        ``row_n`` is fixed here to the number of sub analysers.
        """
        # NOTE(review): assumes each item produced by ``sub_fig_generator`` is
        # an indexable group of ``row_n`` axes -- confirm against its docs.
        ax_gen = iter(sub_fig_generator(*args, row_n=len(self._fea_ana_l), **kwargs))

        for col in self._dataset:
            if not self.skip(col):
                axes = next(ax_gen)
                for fa, ax in zip(self._fea_ana_l, axes):
                    if not fa.skip(col):
                        fa.plot_single(col, ax)
                    # Clear per-axis labels/titles set by the sub analysers;
                    # only the first axis of the group carries the column name.
                    ax.set_xlabel("")
                    ax.set_title("")
                axes[0].set_title(col)
| 45 | + |
| 46 | + |
class NumFeaAnalyser(FeaAnalyser):
    """Analyser base that skips columns whose dtype is ``object``."""

    def skip(self, col):
        """
        Return True when ``col`` has object dtype, logging that it is skipped.

        The check is done on the column's dtype explicitly instead of relying
        on NumPy coercing a whole Series into a dtype.
        """
        is_obj = pd.api.types.is_object_dtype(self._dataset[col].dtype)
        if is_obj:
            logger.info(f"{col} is not numeric and is skipped")
        return is_obj
| 53 | + |
| 54 | + |
class ValueCNT(FeaAnalyser):
    """Count the distinct values of each column per ``DT_COL_NAME`` group."""

    def __init__(self, dataset: pd.DataFrame, ratio=False):
        """
        Parameters
        ----------
        dataset : pd.DataFrame
            Data to analyse.
        ratio : bool
            When true, the counts are divided by the size of each group.
        """
        # Stored before the base initialiser runs; presumably the base class
        # triggers ``calc_stat_values`` there, which reads it -- TODO confirm.
        self.ratio = ratio
        super().__init__(dataset)

    def calc_stat_values(self):
        """Compute the per-group unique-value counts and a shared y-range."""
        base_skip = super().skip
        unique_cnt = {}
        for name, series in self._dataset.items():
            if base_skip(name):
                continue
            unique_cnt[name] = series.groupby(DT_COL_NAME).apply(lambda s: len(s.unique()))

        cnt_df = pd.DataFrame(unique_cnt)
        if self.ratio:
            group_size = self._dataset.groupby(DT_COL_NAME).size()
            cnt_df = cnt_df.div(group_size, axis=0)
        self._val_cnt = cnt_df

        # TODO: the other analysers could borrow this shared y-limit logic
        lo, hi = cnt_df.min().min(), cnt_df.max().max()
        self.ylim = (lo - 0.05 * (hi - lo), hi + 0.05 * (hi - lo))

    def plot_single(self, col, ax):
        """Plot the counts of ``col`` on ``ax`` using the shared y-range."""
        counts = self._val_cnt[col]
        counts.plot(ax=ax, title=col, ylim=self.ylim)
        ax.set_xlabel("")
| 76 | + |
| 77 | + |
class FeaDistAna(NumFeaAnalyser):
    """Plot the value distribution of each feature as a histogram."""

    def plot_single(self, col, ax):
        """Draw a 100-bin histogram (KDE disabled) of ``col`` on ``ax``."""
        values = self._dataset[col]
        sns.histplot(values, ax=ax, kde=False, bins=100)
        ax.set_xlabel("")
        ax.set_title(col)
| 83 | + |
| 84 | + |
class FeaInfAna(NumFeaAnalyser):
    """Count the infinite values of each column per ``DT_COL_NAME`` group."""

    def calc_stat_values(self):
        """Build ``self._inf_cnt``: one column of per-group inf counts per feature."""
        inf_cnt = {}
        for col, item in self._dataset.items():
            # The parent's ``skip`` is used on purpose: this class's own
            # ``skip`` reads ``self._inf_cnt``, which is only being built here.
            if not super().skip(col):
                # ``np.int`` was removed from NumPy; the builtin ``int`` is the
                # type it used to alias.
                inf_cnt[col] = item.apply(np.isinf).astype(int).groupby(DT_COL_NAME).sum()
        self._inf_cnt = pd.DataFrame(inf_cnt)

    def skip(self, col):
        """Skip columns that were not counted or that contain no inf at all."""
        return (col not in self._inf_cnt) or (self._inf_cnt[col].sum() == 0)

    def plot_single(self, col, ax):
        """Plot the per-group inf count of ``col`` on ``ax``."""
        self._inf_cnt[col].plot(ax=ax, title=col)
        ax.set_xlabel("")
| 99 | + |
| 100 | + |
class FeaNanAna(FeaAnalyser):
    """Count the missing values of each column per ``DT_COL_NAME`` group."""

    def calc_stat_values(self):
        """Store the per-group NaN counts in ``self._nan_cnt``."""
        nan_mask = self._dataset.isna()
        self._nan_cnt = nan_mask.groupby(DT_COL_NAME).sum()

    def skip(self, col):
        """Skip columns absent from the counts or without any NaN."""
        if col not in self._nan_cnt:
            return True
        return self._nan_cnt[col].sum() == 0

    def plot_single(self, col, ax):
        """Plot the per-group NaN count of ``col`` on ``ax``."""
        nan_series = self._nan_cnt[col]
        nan_series.plot(ax=ax, title=col)
        ax.set_xlabel("")
| 111 | + |
| 112 | + |
class FeaNanAnaRatio(FeaAnalyser):
    """Plot the share of missing values of each column per ``DT_COL_NAME`` group."""

    def calc_stat_values(self):
        """Store the per-group NaN counts and the per-group row counts."""
        nan_mask = self._dataset.isna()
        self._nan_cnt = nan_mask.groupby(DT_COL_NAME).sum()
        self._total_cnt = self._dataset.groupby(DT_COL_NAME).size()

    def skip(self, col):
        """Skip columns absent from the counts or without any NaN."""
        if col not in self._nan_cnt:
            return True
        return self._nan_cnt[col].sum() == 0

    def plot_single(self, col, ax):
        """Plot NaN count divided by group size for ``col`` on ``ax``."""
        nan_ratio = self._nan_cnt[col] / self._total_cnt
        nan_ratio.plot(ax=ax, title=col)
        ax.set_xlabel("")
| 124 | + |
| 125 | + |
class FeaACAna(FeaAnalyser):
    """Analyse the auto-correlation of features."""

    def calc_stat_values(self):
        """Compute the auto-correlation of every column and a shared y-range."""
        series_by_col = self._dataset.to_dict("series")
        self._fea_corr = pred_autocorr_all(series_by_col)

        # One common y-range, padded by 5% of the span on each side.
        corr_df = pd.DataFrame(self._fea_corr)
        lo, hi = corr_df.min().min(), corr_df.max().max()
        self.ylim = (lo - 0.05 * (hi - lo), hi + 0.05 * (hi - lo))

    def plot_single(self, col, ax):
        """Plot the auto-correlation of ``col`` on ``ax`` using the shared y-range."""
        corr = self._fea_corr[col]
        corr.plot(ax=ax, title=col, ylim=self.ylim)
        ax.set_xlabel("")
| 138 | + |
| 139 | + |
class FeaSkewTurt(NumFeaAnalyser):
    """Plot skewness and kurtosis of each feature on twin y-axes."""

    def calc_stat_values(self):
        """Compute skew and kurt via ``datetime_groupby_apply``."""
        data = self._dataset
        self._skew = datetime_groupby_apply(data, "skew", skip_group=True)
        self._kurt = datetime_groupby_apply(data, pd.DataFrame.kurt, skip_group=True)

    def plot_single(self, col, ax):
        """Draw skew on ``ax`` and kurt on a twin axis, with one merged legend."""
        # Left axis: skew
        self._skew[col].plot(ax=ax, label="skew")
        ax.set_xlabel("")
        ax.set_ylabel("skew")
        ax.legend()

        # Right axis: kurt
        twin_ax = ax.twinx()
        self._kurt[col].plot(ax=twin_ax, label="kurt", color="green")
        twin_ax.set_xlabel("")
        twin_ax.set_ylabel("kurt")

        # Hide the left legend and show both curves in the right one.
        left_handles, left_labels = ax.get_legend_handles_labels()
        right_handles, right_labels = twin_ax.get_legend_handles_labels()
        ax.legend().set_visible(False)
        twin_ax.legend(left_handles + right_handles, left_labels + right_labels)

        ax.set_title(col)
| 163 | + |
| 164 | + |
class FeaMeanStd(NumFeaAnalyser):
    """Plot mean and standard deviation of each feature on twin y-axes."""

    def calc_stat_values(self):
        """Compute the per-``DT_COL_NAME``-group std and mean of the dataset."""
        self._std = self._dataset.groupby(DT_COL_NAME).std()
        self._mean = self._dataset.groupby(DT_COL_NAME).mean()

    def plot_single(self, col, ax):
        """Draw mean on ``ax`` and std on a twin axis, with one merged legend."""
        # Left axis: mean
        self._mean[col].plot(ax=ax, label="mean")
        ax.set_xlabel("")
        ax.set_ylabel("mean")
        ax.legend()

        # Right axis: std
        twin_ax = ax.twinx()
        self._std[col].plot(ax=twin_ax, label="std", color="green")
        twin_ax.set_xlabel("")
        twin_ax.set_ylabel("std")

        # Hide the left legend and show both curves in the right one.
        left_handles, left_labels = ax.get_legend_handles_labels()
        right_handles, right_labels = twin_ax.get_legend_handles_labels()
        ax.legend().set_visible(False)
        twin_ax.legend(left_handles + right_handles, left_labels + right_labels)

        ax.set_title(col)
| 188 | + |
| 189 | + |
class RawFeaAna(FeaAnalyser):
    """
    Motivation:
    - show the raw values as they are, with no further analysis
    """

    def calc_stat_values(self):
        """Derive one y-range for all columns, padded by 5% of the span."""
        lo = self._dataset.min().min()
        hi = self._dataset.max().max()
        self.ylim = (lo - 0.05 * (hi - lo), hi + 0.05 * (hi - lo))

    def plot_single(self, col, ax):
        """Plot the raw values of ``col`` on ``ax`` using the shared y-range."""
        raw = self._dataset[col]
        raw.plot(ax=ax, title=col, ylim=self.ylim)
        ax.set_xlabel("")
0 commit comments