Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified .coverage
Binary file not shown.
264 changes: 183 additions & 81 deletions load_corpus.py
Original file line number Diff line number Diff line change
@@ -1,117 +1,219 @@
from superstyl.load import load_corpus
from superstyl.load_from_config import load_corpus_from_config
from superstyl.config import Config
import json

# TODO: eliminate features that occur only n times ?
# Do the Moisl Selection ?
# TODO: document the new 'lemma' feat for TEI loading

if __name__ == '__main__':
    # Command-line entry point: parse options, load a corpus with the requested
    # feature extraction, and save the feature list (json) and corpus (csv).

    import argparse

    parser = argparse.ArgumentParser(
        description="Load a corpus and extract features for stylometric analysis."
    )
    parser.add_argument('-s',
                        nargs='+',
                        help="paths to files or to json config file",
                        # Not required at parse time so that --config alone can
                        # supply the corpus paths; presence is validated below.
                        required=False,
                        default=None
                        )
    parser.add_argument('--json',
                        action='store_true',
                        help="indicates that the path provided with -s is a JSON config file, "
                             "containing all the options to load the corpus/features"
                        )
    parser.add_argument('--config',
                        action='store',
                        help="path to a new-style JSON config file; options passed on the "
                             "command line (e.g. -s) override the config file values",
                        type=str,
                        default=None
                        )
    parser.add_argument('-o',
                        action='store',
                        help="optional base name of output files",
                        type=str,
                        default=False
                        )
    # Feature list
    parser.add_argument('-f',
                        action="store",
                        help="optional list of features, either in json (generated by"
                             " Superstyl) or simple txt (one word per line)",
                        default=False
                        )
    parser.add_argument('-t',
                        action='store',
                        help="types of features (words, chars, affixes - "
                             "as per Sapkota et al. 2015 -, as well as lemma or pos, met_line, "
                             "met_syll (those four last only for TEI files with proper annotation)",
                        type=str, default="words",
                        choices=["words", "chars", "affixes", "pos", "lemma", "met_line", "met_syll"]
                        )
    parser.add_argument('-n',
                        action='store',
                        help="n grams lengths (default 1)",
                        default=1,
                        type=int)
    parser.add_argument('-k',
                        action='store',
                        help="How many most frequent features?",
                        default=5000,
                        type=int
                        )
    parser.add_argument('--freqs',
                        action='store',
                        help="relative, absolute or binarised freqs",
                        default="relative",
                        choices=["relative", "absolute", "binary"]
                        )
    parser.add_argument('-x',
                        action='store',
                        help="format (txt, xml, tei, or txm) WARNING: only txt is fully implemented",
                        default="txt",
                        choices=["txt", "xml", "tei", 'txm']
                        )
    parser.add_argument('--sampling',
                        action='store_true',
                        help="Sample the texts?",
                        default=False
                        )
    parser.add_argument('--sample_units',
                        action='store',
                        help="Units of length for sampling (words, verses; default: words)",
                        choices=["words", "verses"],
                        default="words",
                        type=str
                        )
    parser.add_argument('--sample_size',
                        action='store',
                        help="Size for sampling (default: 3000)",
                        default=3000,
                        type=int
                        )
    parser.add_argument('--sample_step',
                        action='store',
                        help="Step for sampling with overlap (default is no overlap)",
                        default=None,
                        type=int
                        )
    parser.add_argument('--max_samples',
                        action='store',
                        help="Maximum number of (randomly selected) samples per class, e.g. author (default is all)",
                        default=None,
                        type=int
                        )
    parser.add_argument('--samples_random',
                        action='store_true',
                        help="Should random sampling with replacement be performed "
                             "instead of continuous sampling (default: false)",
                        default=False
                        )
    parser.add_argument('--keep_punct',
                        action='store_true',
                        help="whether to keep punctuation and caps (default is False)",
                        default=False
                        )
    parser.add_argument('--keep_sym',
                        action='store_true',
                        help="if true, same as keep_punct, plus no Unidecode, "
                             "and numbers are kept as well (default is False)",
                        default=False
                        )
    parser.add_argument('--no_ascii',
                        action='store_true',
                        help="disables the conversion to ascii as per the Unidecode module. "
                             "Useful for non Latin alphabet (default is conversion to ASCII)",
                        default=False
                        )
    parser.add_argument('--identify_lang',
                        action='store_true',
                        help="if true, should the language of each text be guessed, "
                             "using langdetect (default is False)",
                        default=False
                        )
    parser.add_argument('--embedding',
                        action="store",
                        help="optional path to a word2vec embedding in txt format to compute "
                             "frequencies among a set of semantic neighbourgs (i.e., pseudo-paronyms)",
                        default=False
                        )
    parser.add_argument('--neighbouring_size',
                        action="store",
                        help="size of semantic neighbouring in the embedding (n closest neighbours)",
                        default=10,
                        type=int
                        )
    parser.add_argument('--culling',
                        action="store",
                        help="percentage value for culling, meaning in what "
                             "percentage of samples should a feature be present "
                             "to be retained (default is 0, meaning no culling)",
                        default=0,
                        type=float)
    args = parser.parse_args()

    # Load feature list if provided (json produced by Superstyl, or plain txt
    # with one feature per line; anything else is ignored with a warning).
    my_feats = None
    if args.f:
        with open(args.f, 'r') as f:
            if args.f.endswith(".json"):
                print(".......loading preexisting feature list from json.......")
                my_feats = json.loads(f.read())

            elif args.f.endswith(".txt"):
                print(".......loading preexisting feature list from txt.......")
                # txt lists carry no counts; pair each feature with a 0 placeholder
                my_feats = [[feat.rstrip(), 0] for feat in f.readlines()]

            else:
                print(".......unknown feature list format. Ignoring.......")
                my_feats = None

    if args.json:
        # Legacy path: -s points directly to a JSON config file that fully
        # describes the corpus and features.
        if not args.s:
            parser.error("-s (path to the JSON config file) is required with --json")
        corpus, my_feats = load_corpus_from_config(args.s)

    else:
        if args.config:
            # Load from new-style JSON config file
            config = Config.from_json(args.config)
            # Override paths if provided via CLI
            if args.s:
                config.corpus.paths = args.s

        else:
            # Build the config entirely from command-line options
            if not args.s:
                parser.error("-s (paths) is required when not using --config")

            config = Config.from_kwargs(
                data_paths=args.s,
                feats=args.t,
                n=args.n,
                k=args.k,
                freqsType=args.freqs,
                format=args.x,
                sampling=args.sampling,
                units=args.sample_units,
                size=args.sample_size,
                step=args.sample_step,
                max_samples=args.max_samples,
                samples_random=args.samples_random,
                keep_punct=args.keep_punct,
                keep_sym=args.keep_sym,
                no_ascii=args.no_ascii,
                identify_lang=args.identify_lang,
                embedding=args.embedding,
                neighbouring_size=args.neighbouring_size,
                culling=args.culling
            )

        # Inject my_feats if provided
        if my_feats and config.features:
            config.features[0].feat_list = my_feats

        # Load corpus
        corpus, my_feats = load_corpus(config=config)

    # Determine output file names (explicit base name via -o, or a default
    # derived from the feature options)
    if args.o:
        feat_file = args.o + "_feats.json"
        corpus_file = args.o + ".csv"

    else:
        feat_file = f"feature_list_{args.t}{args.n}grams{args.k}mf.json"
        corpus_file = f"feats_tests_n{args.n}_k_{args.k}.csv"

    # Save results
    print(".......saving results.......")

    with open(feat_file, "w") as out:
        out.write(json.dumps(my_feats, ensure_ascii=False, indent=0))
    print(f"Features list saved to {feat_file}")

    # Save corpus
    # NOTE(review): corpus is presumably a pandas DataFrame (to_csv) — confirm
    # against superstyl.load.load_corpus.
    corpus.to_csv(corpus_file)
    print(f"Corpus saved to {corpus_file}")
48 changes: 26 additions & 22 deletions split.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
"""
Command-line tool for splitting datasets.
"""

import superstyl.preproc.select as sel


Expand All @@ -12,28 +16,28 @@
default=False)
parser.add_argument('-m', action="store", help="path to metadata file", required=False)
parser.add_argument('-e', action="store", help="path to excludes file", required=False)
parser.add_argument('--lang', action="store", help="analyse only file in this language (optional, for initial split only)", required=False)
parser.add_argument('--nosplit', action="store_true", help="no split (do not provide split file)", default=False)
parser.add_argument('--lang', action="store",
help="analyse only file in this language (optional, for initial split only)",
required=False)
parser.add_argument('--nosplit', action="store_true",
help="no split (do not provide split file)",
default=False)
parser.add_argument('--split_ratio', action="store", type=float,
help="validation split ratio (default: 0.1 = 10%%)",
default=0.1)
args = parser.parse_args()

# Dispatch: either apply a previously saved selection, or build a new one.
if args.s:
    # Apply existing selection
    sel.apply_selection(path=args.path, presplit_path=args.s)
else:
    # Create new selection (with or without split); the saved file name
    # records which mode produced it.
    sel.read_clean(
        path=args.path,
        metadata_path=args.m,
        excludes_path=args.e,
        savesplit="split_nosplit.json" if args.nosplit else "split.json",
        lang=args.lang,
        split=not args.nosplit,
        split_ratio=args.split_ratio
    )
Loading