SuperStyl/superstyl/load.py at 0fbea39cbfce23dde5be265879c8246b21034f21 · SupervisedStylometry/SuperStyl · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
import superstyl.preproc.pipe as pipe
import superstyl.preproc.features_extract as fex
from superstyl.preproc.text_count import count_process
import superstyl.preproc.embedding as embed
import tqdm
import pandas
from typing import Optional, List, Tuple, Union

from superstyl.config import Config, FeatureConfig


def _load_single_feature(
    myTexts: List[dict],
    feat_config: FeatureConfig,
    use_provided_feat_list: bool = False,
) -> Tuple[pandas.DataFrame, List]:
    """
    Extract features for a single FeatureConfig.
    Internal function used by load_corpus.

    Args:
        myTexts: Loaded texts/samples, one dict per document. Each dict is
            shallow-copied before counting so the caller's list can be
            reused for further feature sets.
        feat_config: Settings for this feature set. The fields read here are
            type, n, k, freq_type, feat_list, embedding, neighbouring_size
            and culling.
        use_provided_feat_list: If True and feat_config.feat_list is provided,
            return that list instead of the computed one. Used for test sets
            to ensure same features as training set.

    Returns:
        A tuple (feats_df, feat_list). feats_df is indexed by each text's
        "name" and has one column per retained feature. feat_list holds the
        entries that survived selection, embedding and culling, or the
        provided list unchanged when use_provided_feat_list applies.
    """
    feats = feat_config.type
    n = feat_config.n
    k = feat_config.k
    freqsType = feat_config.freq_type
    provided_feat_list = feat_config.feat_list
    embedding = feat_config.embedding
    neighbouring_size = feat_config.neighbouring_size
    culling = feat_config.culling

    embeddedFreqs = False
    if embedding:
        print(".......loading embedding.......")
        model = embed.load_embeddings(embedding)
        embeddedFreqs = True
        # Embedding overrides the configured frequency type.
        freqsType = "absolute"

    print(f".......getting features ({feats}, n={n}).......")

    if provided_feat_list is None:
        feat_list = fex.get_feature_list(myTexts, feats=feats, n=n, freqsType=freqsType)
        if k > len(feat_list):
            print(f"K limit ignored ({len(feat_list)} < {k})")
        else:
            # Keep every entry whose score ties with (or beats) the k-th one,
            # so the result may hold more than k features.
            # NOTE(review): presumably feat_list is sorted by decreasing
            # score -- confirm against fex.get_feature_list.
            val = feat_list[k-1][1]
            feat_list = [m for m in feat_list if m[1] >= val]
    else:
        feat_list = provided_feat_list

    print(".......getting counts.......")

    my_feats = [m[0] for m in feat_list]
    # Copy myTexts to avoid mutating original for multi-feature
    texts_copy = [dict(t) for t in myTexts]
    texts_copy = fex.get_counts(texts_copy, feat_list=my_feats, feats=feats, n=n, freqsType=freqsType)

    if embedding:
        print(".......embedding counts.......")
        texts_copy, my_feats = embed.get_embedded_counts(texts_copy, my_feats, model, topn=neighbouring_size)
        # Set lookup keeps this filter linear in the number of features.
        embedded_feats = set(my_feats)
        feat_list = [f for f in feat_list if f[0] in embedded_feats]

    if culling > 0:
        print(f".......Culling at {culling}%.......")
        feats_doc_freq = fex.get_doc_frequency(texts_copy)
        # Keep features whose document-frequency percentage is strictly
        # above the culling threshold.
        my_feats = [f for f in my_feats if (feats_doc_freq[f] / len(texts_copy) * 100) > culling]
        surviving_feats = set(my_feats)
        feat_list = [f for f in feat_list if f[0] in surviving_feats]

    print(".......feeding data frame.......")

    loc = {}
    for t in tqdm.tqdm(texts_copy):
        text, local_freqs = count_process((t, my_feats), embeddedFreqs=embeddedFreqs)
        loc[text["name"]] = local_freqs

    feats_df = pandas.DataFrame.from_dict(loc, columns=list(my_feats), orient="index")

    # For test sets: return the provided feat_list unchanged
    if use_provided_feat_list and provided_feat_list is not None:
        return feats_df, provided_feat_list

    return feats_df, feat_list


def load_corpus(
    config: Optional[Config] = None,
    use_provided_feat_list: bool = False,
    **kwargs
) -> Tuple[pandas.DataFrame, Union[List, List[List]]]:
    """
    Load a corpus and extract features.

    Can be called with:
    1. A Config object: load_corpus(config=my_config)
    2. Individual parameters (backward compatible):
       load_corpus(data_paths=paths, feats="chars", n=3)

    Args:
        config: Configuration object. If None, built from kwargs.
        use_provided_feat_list: If True and feat_list provided, return it unchanged.
                               Use for test sets to match training features.
        **kwargs: Individual parameters for backward compatibility.
                  Supported: data_paths, feat_list, feats, n, k, freqsType,
                  format, sampling, units, size, step, max_samples, samples_random,
                  keep_punct, keep_sym, no_ascii, identify_lang, embedding,
                  neighbouring_size, culling

    Returns:
        - If single feature: (DataFrame, feat_list)
        - If multiple features: (DataFrame with prefixed columns, list of feat_lists)

    Raises:
        ValueError: If a 'lemma', 'pos', 'met_line' or 'met_syll' feature is
            requested on a corpus whose format is not 'tei', or if a
            'met_line' / 'met_syll' feature is requested while the sampling
            units are not 'verses'. config.validate() is also called and may
            raise on its own terms.
    """
    # Build config from kwargs if not provided
    if config is None:
        config = Config.from_kwargs(**kwargs)

    # Validate configuration
    config.validate()

    # Normalise paths: a string is either a glob pattern or a single file.
    data_paths = config.corpus.paths
    if isinstance(data_paths, str):
        import glob
        if '*' in data_paths or '?' in data_paths:
            # Sorted so the text order does not depend on the filesystem.
            data_paths = sorted(glob.glob(data_paths))
        else:
            # Single file path - wrap in list
            data_paths = [data_paths]

    # Check feature types against corpus format and sampling units
    # before any text is loaded.
    for feat_config in config.features:
        if feat_config.type in ('lemma', 'pos', 'met_line', 'met_syll') and config.corpus.format != 'tei':
            raise ValueError(f"{feat_config.type} requires TEI format.")
        if feat_config.type in ('met_line', 'met_syll') and config.sampling.units != 'verses':
            raise ValueError(f"{feat_config.type} requires verses units.")

    # Load texts once
    print(".......loading texts.......")

    if config.sampling.enabled:
        myTexts = pipe.docs_to_samples(
            data_paths,
            config=config
        )
    else:
        myTexts = pipe.load_texts(
            data_paths,
            config=config
        )

    unique_texts = [text["name"] for text in myTexts]

    # Build metadata
    metadata = pandas.DataFrame(
        columns=['author', 'lang'],
        index=unique_texts,
        data=[[t["aut"], t["lang"]] for t in myTexts]
    )

    # Single feature case
    if len(config.features) == 1:
        feat_config = config.features[0]
        feats_df, feat_list = _load_single_feature(
            myTexts, feat_config, use_provided_feat_list
        )
        corpus = pandas.concat([metadata, feats_df], axis=1)
        return corpus, feat_list

    # Multiple features case
    print(f".......extracting {len(config.features)} feature sets.......")

    all_feat_lists = []
    merged_feats = metadata.copy()

    for i, feat_config in enumerate(config.features):
        # Fall back to a positional prefix (f1, f2, ...) for unnamed sets.
        prefix = feat_config.name or f"f{i+1}"
        print(f".......processing {prefix}.......")

        feats_df, feat_list = _load_single_feature(
            myTexts, feat_config, use_provided_feat_list
        )

        # Prefix columns to avoid collisions
        feats_df = feats_df.rename(columns={col: f"{prefix}_{col}" for col in feats_df.columns})

        merged_feats = pandas.concat([merged_feats, feats_df], axis=1)
        all_feat_lists.append(feat_list)

    return merged_feats, all_feat_lists