Skip to content

Commit b318237

Browse files
authored
Merge pull request #88 from SupervisedStylometry/parseTEI-recovered
TEI Loading Enhancement & Config-Based Corpus Creation
2 parents 8066bda + d680b55 commit b318237

8 files changed

Lines changed: 745 additions & 41 deletions

File tree

load_corpus.py

Lines changed: 20 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,22 @@
11
from superstyl.load import load_corpus
2+
from superstyl.load_from_config import load_corpus_from_config
23
import json
34

45
# TODO: eliminate features that occur only n times ?
56
# Do the Moisl Selection ?
7+
# TODO: document the new 'lemma' feat for TEI loading
68

79
if __name__ == '__main__':
810

911
import argparse
1012

1113
parser = argparse.ArgumentParser()
12-
parser.add_argument('-s', nargs='+', help="paths to files", required=True)
14+
parser.add_argument('-s', nargs='+', help="paths to files or to json config file", required=True)
15+
parser.add_argument('--json', action='store_true', help="indicates that the path provided with -s is a JSON config file, "
16+
"containing all the options to load the corpus/features")
1317
parser.add_argument('-o', action='store', help="optional base name of output files", type=str, default=False)
14-
parser.add_argument('-f', action="store", help="optional list of features, either in json (generated by Superstyl) or simple txt (one word per line)", default=False)
18+
parser.add_argument('-f', action="store", help="optional list of features, either in json (generated by"
19+
" Superstyl) or simple txt (one word per line)", default=False)
1520
parser.add_argument('-t', action='store', help="types of features (words, chars, affixes - "
1621
"as per Sapkota et al. 2015 - or pos). pos are currently"
1722
"only implemented for Modern English", type=str,
@@ -76,16 +81,19 @@
7681
else:
7782
my_feats = None
7883

79-
corpus, my_feats = load_corpus(args.s, feat_list=my_feats, feats=args.t, n=args.n, k=args.k,
80-
freqsType=args.freqs, format=args.x,
81-
sampling=args.sampling, units=args.sample_units,
82-
size=args.sample_size, step=args.sample_step, max_samples=args.max_samples,
83-
samples_random=args.samples_random,
84-
keep_punct=args.keep_punct, keep_sym=args.keep_sym, no_ascii=args.no_ascii,
85-
identify_lang=args.identify_lang,
86-
embedding=args.embedding, neighbouring_size=args.neighbouring_size,
87-
culling=args.culling
88-
)
84+
if args.json:
85+
corpus, my_feats = load_corpus_from_config(args.s)
86+
else:
87+
corpus, my_feats = load_corpus(args.s, feat_list=my_feats, feats=args.t, n=args.n, k=args.k,
88+
freqsType=args.freqs, format=args.x,
89+
sampling=args.sampling, units=args.sample_units,
90+
size=args.sample_size, step=args.sample_step, max_samples=args.max_samples,
91+
samples_random=args.samples_random,
92+
keep_punct=args.keep_punct, keep_sym=args.keep_sym, no_ascii=args.no_ascii,
93+
identify_lang=args.identify_lang,
94+
embedding=args.embedding, neighbouring_size=args.neighbouring_size,
95+
culling=args.culling
96+
)
8997

9098
print(".......saving results.......")
9199

superstyl/load.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, freqsTyp
1717
:param feats: the type of features, one of 'words', 'chars', 'affixes', or 'POS'. Affixes are inspired by
1818
Sapkota et al. 2015, and include space_prefix, space_suffix, prefix, suffix, and, if keep_pos, punctuation n-grams.
1919
POS are currently only implemented for Modern English
20+
TODO: add met_line, met_syll
2021
:param n: n grams lengths (default 1)
2122
:param k: How many most frequent? The function takes the rank of k (if k is smaller than the total number of features),
2223
gets its frequencies, and only include features of superior or equal total frequencies.
@@ -55,13 +56,13 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, freqsTyp
5556
print(".......loading texts.......")
5657

5758
if sampling:
58-
myTexts = pipe.docs_to_samples(data_paths, format=format, units=units, size=size, step=step,
59+
myTexts = pipe.docs_to_samples(data_paths, feats=feats, format=format, units=units, size=size, step=step,
5960
max_samples=max_samples, samples_random=samples_random,
6061
keep_punct=keep_punct, keep_sym=keep_sym, no_ascii=no_ascii,
6162
identify_lang = identify_lang)
6263

6364
else:
64-
myTexts = pipe.load_texts(data_paths, format=format, max_samples=max_samples, keep_punct=keep_punct,
65+
myTexts = pipe.load_texts(data_paths, feats=feats, format=format, max_samples=max_samples, keep_punct=keep_punct,
6566
keep_sym=keep_sym, no_ascii=no_ascii, identify_lang=identify_lang)
6667

6768
print(".......getting features.......")

superstyl/load_from_config.py

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
import json
2+
import superstyl
3+
import pandas as pd
4+
import os
5+
6+
from superstyl.load import load_corpus
7+
8+
def load_corpus_from_config(config_path):
    """
    Load a corpus based on a JSON configuration file.

    Parameters:
    -----------
    config_path : str
        Path to the JSON configuration file

    Returns:
    --------
    tuple: (corpus, feat_list) - Same format as load_corpus function
        If multiple features are defined, returns the merged corpus and the combined feature list
        If only one feature is defined, returns that corpus and its feature list

    Raises:
    -------
    ValueError
        If the config file is not JSON, declares no paths or no features,
        or references a feature list file in an unsupported format.
    """
    # Load configuration
    if not config_path.endswith('.json'):
        raise ValueError(f"Unsupported configuration file format: {config_path}. Only JSON format is supported.")

    with open(config_path, 'r') as f:
        config = json.load(f)

    # Get corpus paths: either an explicit list or a single glob pattern string
    if 'paths' not in config:
        raise ValueError("No paths provided and no paths found in config")
    if isinstance(config['paths'], list):
        paths = config['paths']
    elif isinstance(config['paths'], str):
        paths = [config['paths']]
    else:
        raise ValueError("Paths in config must be either a list or a glob pattern string")

    # Sampling parameters are shared by all feature definitions
    sampling_params = config.get('sampling', {})

    feature_configs = config.get('features', [])
    if not feature_configs:
        raise ValueError("No features specified in the configuration")

    # If there's only one feature, we can simply return the result of load_corpus
    if len(feature_configs) == 1:
        feature_config = feature_configs[0]
        feature_name = feature_config.get('name', "f1")
        feat_list = _load_feat_list(feature_config.get('feat_list'))
        params = _build_load_params(config, feature_config, sampling_params)
        print(f"Loading corpus with {feature_name}...")
        return load_corpus(paths, feat_list=feat_list, **params)

    # For multiple features, process each one and merge the results
    corpora = {}
    feature_lists = {}

    for i, feature_config in enumerate(feature_configs):
        feature_name = feature_config.get('name', f"f{i+1}")
        feat_list = _load_feat_list(feature_config.get('feat_list'))
        params = _build_load_params(config, feature_config, sampling_params)
        print(f"Loading {feature_name}...")
        corpus, features = load_corpus(paths, feat_list=feat_list, **params)
        corpora[feature_name] = corpus
        feature_lists[feature_name] = features

    # Create a merged dataset
    print("Creating merged dataset...")
    first_corpus_name = next(iter(corpora))

    # Start with metadata from the first corpus; all corpora were built from
    # the same paths/sampling, so their indices should match
    metadata = corpora[first_corpus_name][['author', 'lang']]
    merged = pd.DataFrame(index=metadata.index)
    merged = pd.concat([metadata, merged], axis=1)

    # Combine all features, prefixing column names with the feature-set name
    # to avoid collisions between feature sets
    all_features = []
    for name, corpus in corpora.items():
        feature_cols = [col for col in corpus.columns if col not in ['author', 'lang']]
        renamed_cols = {col: f"{name}_{col}" for col in feature_cols}
        feature_df = corpus[feature_cols].rename(columns=renamed_cols)
        merged = pd.concat([merged, feature_df], axis=1)
        for feature in feature_lists[name]:
            all_features.append((f"{name}_{feature[0]}", feature[1]))

    # Return the merged corpus and combined feature list
    return merged, all_features


def _load_feat_list(feat_list_path):
    """Load an optional predefined feature list (.json or .txt); None if no path given.

    .txt files hold one feature per line; blank lines are skipped and each
    entry gets a placeholder total frequency of 0, matching the
    Superstyl feature-list format [[feat, freq], ...].
    """
    if not feat_list_path:
        return None
    if feat_list_path.endswith('.json'):
        with open(feat_list_path, 'r') as f:
            return json.load(f)
    if feat_list_path.endswith('.txt'):
        with open(feat_list_path, 'r') as f:
            return [[line.strip(), 0] for line in f if line.strip()]
    raise ValueError(f"Unsupported feature list format: {feat_list_path}. Only .json or .txt are supported.")


def _build_load_params(config, feature_config, sampling_params):
    """Assemble the keyword arguments for load_corpus from the config sections.

    Text-cleaning flags (keep_punct, keep_sym, no_ascii, identify_lang) are
    read from the feature entry first, falling back to the top-level config
    (the two original code paths disagreed on where to look; this honours both).
    """
    def _flag(key, default=False):
        # feature-level setting wins over the top-level config setting
        return feature_config.get(key, config.get(key, default))

    return {
        'feats': feature_config.get('type', 'words'),
        'n': feature_config.get('n', 1),
        'k': feature_config.get('k', 5000),
        'freqsType': feature_config.get('freq_type', 'relative'),
        'format': config.get('format', 'txt'),
        'sampling': sampling_params.get('enabled', False),
        'units': sampling_params.get('units', 'words'),
        'size': sampling_params.get('sample_size', 3000),
        'step': sampling_params.get('sample_step', None),
        'max_samples': sampling_params.get('max_samples', None),
        'samples_random': sampling_params.get('sample_random', False),
        'keep_punct': _flag('keep_punct'),
        'keep_sym': _flag('keep_sym'),
        'no_ascii': _flag('no_ascii'),
        'identify_lang': _flag('identify_lang'),
        'embedding': feature_config.get('embedding', None),
        'neighbouring_size': feature_config.get('neighbouring_size', 10),
        'culling': feature_config.get('culling', 0)
    }
173+

superstyl/preproc/features_extract.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,15 +20,15 @@ def count_features(text, feats ="words", n = 1):
2020
raise ValueError("Text cannot be empty.")
2121
if n < 1 or not isinstance(n, int):
2222
raise ValueError("n must be a positive integer.")
23-
if feats not in ["words", "chars", "affixes", "pos"]:
24-
raise ValueError("Unsupported feature type. Choose from 'words', 'chars', 'affixes', or 'pos'.")
23+
if feats not in ["words", "chars", "affixes", "pos", "met_line", "met_syll"]:
24+
raise ValueError("Unsupported feature type. Choose from 'words', 'chars', 'affixes', 'met_line', 'met_syll', or 'pos'.")
2525
if feats == "words":
2626
tokens = nltk.tokenize.wordpunct_tokenize(text)
2727
if n > 1:
2828
tokens = ["_".join(t) for t in list(nltk.ngrams(tokens, n))]
2929
total = len(tokens)
3030

31-
elif feats == "chars":
31+
elif feats in ["chars", "met_syll"]:
3232
tokens = [re.sub(r'\p{Z}', '_', ''.join(ngram)) for ngram in nltk.ngrams(text, n)]
3333
total = len(tokens)
3434

@@ -60,14 +60,20 @@ def count_features(text, feats ="words", n = 1):
6060
tokens = pos_tags
6161
total = len(tokens)
6262

63+
elif feats == "met_line":
64+
tokens = text.split()
65+
if n > 1:
66+
tokens = ["_".join(t) for t in list(nltk.ngrams(tokens, n))]
67+
total = len(tokens)
68+
6369
# Adding sentence length ; still commented as it is a work in progress, an integer won't do, a quantile would be better
6470
#elif feats == "sentenceLength":
6571
# sentences = nltk.tokenize.sent_tokenize(text)
6672
# tokens = [str(len(nltk.tokenize.word_tokenize(sentence))) for sentence in sentences]
6773

6874
#Adding an error message in case some distracted guy like me would enter something wrong:
6975
else:
70-
raise ValueError("Unsupported feature type. Choose from 'words', 'chars', 'affixes' or 'pos'.")
76+
raise ValueError("Unsupported feature type. Choose from 'words', 'chars', 'affixes', 'met_line', 'met_syll' or 'pos'.")
7177

7278
counts = Counter()
7379
counts.update(tokens)
@@ -102,7 +108,7 @@ def get_feature_list(myTexts, feats="words", n=1, freqsType="relative"):
102108
"""
103109
:param myTexts: a 'myTexts' object, containing documents to be processed
104110
:param feat_list: a list of features to be selected
105-
:param feats: type of feats (words, chars, affixes or POS)
111+
:param feats: type of feats (words, chars, affixes, POS, met_line, or met_syll)
106112
:param freqsType: "relative", "absolute" or "binary" frequencies
107113
:param n: n-grams length
108114
:return: list of features, with total frequency

0 commit comments

Comments
 (0)