Skip to content

Commit b318237

Browse files
authored
Merge pull request #88 from SupervisedStylometry/parseTEI-recovered
TEI Loading Enhancement & Config-Based Corpus Creation
2 parents 8066bda + d680b55 commit b318237

8 files changed

Lines changed: 745 additions & 41 deletions

File tree

load_corpus.py

Lines changed: 20 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,22 @@
11
from superstyl.load import load_corpus
2+
from superstyl.load_from_config import load_corpus_from_config
23
import json
34

45
# TODO: eliminate features that occur only n times ?
56
# Do the Moisl Selection ?
7+
# TODO: document the new 'lemma' feat for TEI loading
68

79
if __name__ == '__main__':
810

911
import argparse
1012

1113
parser = argparse.ArgumentParser()
12-
parser.add_argument('-s', nargs='+', help="paths to files", required=True)
14+
parser.add_argument('-s', nargs='+', help="paths to files or to json config file", required=True)
15+
parser.add_argument('--json', action='store_true', help="indicates that the path provided with -s is a JSON config file, "
16+
"containing all the options to load the corpus/features")
1317
parser.add_argument('-o', action='store', help="optional base name of output files", type=str, default=False)
14-
parser.add_argument('-f', action="store", help="optional list of features, either in json (generated by Superstyl) or simple txt (one word per line)", default=False)
18+
parser.add_argument('-f', action="store", help="optional list of features, either in json (generated by"
19+
" Superstyl) or simple txt (one word per line)", default=False)
1520
parser.add_argument('-t', action='store', help="types of features (words, chars, affixes - "
1621
"as per Sapkota et al. 2015 - or pos). pos are currently"
1722
"only implemented for Modern English", type=str,
@@ -76,16 +81,19 @@
7681
else:
7782
my_feats = None
7883

79-
corpus, my_feats = load_corpus(args.s, feat_list=my_feats, feats=args.t, n=args.n, k=args.k,
80-
freqsType=args.freqs, format=args.x,
81-
sampling=args.sampling, units=args.sample_units,
82-
size=args.sample_size, step=args.sample_step, max_samples=args.max_samples,
83-
samples_random=args.samples_random,
84-
keep_punct=args.keep_punct, keep_sym=args.keep_sym, no_ascii=args.no_ascii,
85-
identify_lang=args.identify_lang,
86-
embedding=args.embedding, neighbouring_size=args.neighbouring_size,
87-
culling=args.culling
88-
)
84+
if args.json:
85+
corpus, my_feats = load_corpus_from_config(args.s)
86+
else:
87+
corpus, my_feats = load_corpus(args.s, feat_list=my_feats, feats=args.t, n=args.n, k=args.k,
88+
freqsType=args.freqs, format=args.x,
89+
sampling=args.sampling, units=args.sample_units,
90+
size=args.sample_size, step=args.sample_step, max_samples=args.max_samples,
91+
samples_random=args.samples_random,
92+
keep_punct=args.keep_punct, keep_sym=args.keep_sym, no_ascii=args.no_ascii,
93+
identify_lang=args.identify_lang,
94+
embedding=args.embedding, neighbouring_size=args.neighbouring_size,
95+
culling=args.culling
96+
)
8997

9098
print(".......saving results.......")
9199

superstyl/load.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, freqsTyp
1717
:param feats: the type of features, one of 'words', 'chars', 'affixes', or 'POS'. Affixes are inspired by
1818
Sapkota et al. 2015, and include space_prefix, space_suffix, prefix, suffix, and, if keep_pos, punctuation n-grams.
1919
POS are currently only implemented for Modern English
20+
TODO: add met_line, met_syll
2021
:param n: n grams lengths (default 1)
2122
:param k: How many most frequent? The function takes the rank of k (if k is smaller than the total number of features),
2223
gets its frequencies, and only include features of superior or equal total frequencies.
@@ -55,13 +56,13 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, freqsTyp
5556
print(".......loading texts.......")
5657

5758
if sampling:
58-
myTexts = pipe.docs_to_samples(data_paths, format=format, units=units, size=size, step=step,
59+
myTexts = pipe.docs_to_samples(data_paths, feats=feats, format=format, units=units, size=size, step=step,
5960
max_samples=max_samples, samples_random=samples_random,
6061
keep_punct=keep_punct, keep_sym=keep_sym, no_ascii=no_ascii,
6162
identify_lang = identify_lang)
6263

6364
else:
64-
myTexts = pipe.load_texts(data_paths, format=format, max_samples=max_samples, keep_punct=keep_punct,
65+
myTexts = pipe.load_texts(data_paths, feats=feats, format=format, max_samples=max_samples, keep_punct=keep_punct,
6566
keep_sym=keep_sym, no_ascii=no_ascii, identify_lang=identify_lang)
6667

6768
print(".......getting features.......")

superstyl/load_from_config.py

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
import json
2+
import superstyl
3+
import pandas as pd
4+
import os
5+
6+
from superstyl.load import load_corpus
7+
8+
def load_corpus_from_config(config_path):
    """
    Load a corpus based on a JSON configuration file.

    Parameters:
    -----------
    config_path : str
        Path to the JSON configuration file

    Returns:
    --------
    tuple: (corpus, feat_list) - Same format as load_corpus function
        If multiple features are defined, returns the merged corpus and the combined feature list
        If only one feature is defined, returns that corpus and its feature list

    Raises:
    -------
    ValueError
        If the config file is not JSON, declares no paths or no features,
        or references a feature list file in an unsupported format.
    """
    # Load configuration
    if not config_path.endswith('.json'):
        raise ValueError(f"Unsupported configuration file format: {config_path}. Only JSON format is supported.")

    with open(config_path, 'r') as f:
        config = json.load(f)

    # Get corpus paths: either an explicit list or a single glob pattern string
    if 'paths' not in config:
        raise ValueError("No paths provided and no paths found in config")
    if isinstance(config['paths'], list):
        paths = config['paths']
    elif isinstance(config['paths'], str):
        paths = [config['paths']]
    else:
        raise ValueError("Paths in config must be either a list or a glob pattern string")

    # Sampling parameters are shared by all feature definitions
    sampling_params = config.get('sampling', {})

    feature_configs = config.get('features', [])
    if not feature_configs:
        raise ValueError("No features specified in the configuration")

    # If there's only one feature, we can simply return the result of load_corpus
    if len(feature_configs) == 1:
        feature_config = feature_configs[0]
        feature_name = feature_config.get('name', "f1")
        feat_list = _load_feat_list(feature_config.get('feat_list'))
        params = _build_load_params(config, feature_config, sampling_params)
        print(f"Loading corpus with {feature_name}...")
        return load_corpus(paths, feat_list=feat_list, **params)

    # For multiple features, process each one and merge the results
    corpora = {}
    feature_lists = {}

    for i, feature_config in enumerate(feature_configs):
        feature_name = feature_config.get('name', f"f{i+1}")
        feat_list = _load_feat_list(feature_config.get('feat_list'))
        params = _build_load_params(config, feature_config, sampling_params)
        print(f"Loading {feature_name}...")
        corpus, features = load_corpus(paths, feat_list=feat_list, **params)
        corpora[feature_name] = corpus
        feature_lists[feature_name] = features

    # Create a merged dataset
    print("Creating merged dataset...")
    first_corpus_name = next(iter(corpora))

    # Start with metadata from the first corpus; all corpora were built from
    # the same paths/sampling, so their indices should match
    metadata = corpora[first_corpus_name][['author', 'lang']]
    merged = pd.DataFrame(index=metadata.index)
    merged = pd.concat([metadata, merged], axis=1)

    # Combine all features, prefixing column names with the feature-set name
    # to avoid collisions between feature sets
    all_features = []
    for name, corpus in corpora.items():
        feature_cols = [col for col in corpus.columns if col not in ['author', 'lang']]
        renamed_cols = {col: f"{name}_{col}" for col in feature_cols}
        feature_df = corpus[feature_cols].rename(columns=renamed_cols)
        merged = pd.concat([merged, feature_df], axis=1)
        for feature in feature_lists[name]:
            all_features.append((f"{name}_{feature[0]}", feature[1]))

    # Return the merged corpus and combined feature list
    return merged, all_features


def _load_feat_list(feat_list_path):
    """Load an optional predefined feature list (.json or .txt); None if no path given.

    .txt files hold one feature per line; blank lines are skipped and each
    entry gets a placeholder total frequency of 0, matching the
    Superstyl feature-list format [[feat, freq], ...].
    """
    if not feat_list_path:
        return None
    if feat_list_path.endswith('.json'):
        with open(feat_list_path, 'r') as f:
            return json.load(f)
    if feat_list_path.endswith('.txt'):
        with open(feat_list_path, 'r') as f:
            return [[line.strip(), 0] for line in f if line.strip()]
    raise ValueError(f"Unsupported feature list format: {feat_list_path}. Only .json or .txt are supported.")


def _build_load_params(config, feature_config, sampling_params):
    """Assemble the keyword arguments for load_corpus from the config sections.

    Text-cleaning flags (keep_punct, keep_sym, no_ascii, identify_lang) are
    read from the feature entry first, falling back to the top-level config
    (the two original code paths disagreed on where to look; this honours both).
    """
    def _flag(key, default=False):
        # feature-level setting wins over the top-level config setting
        return feature_config.get(key, config.get(key, default))

    return {
        'feats': feature_config.get('type', 'words'),
        'n': feature_config.get('n', 1),
        'k': feature_config.get('k', 5000),
        'freqsType': feature_config.get('freq_type', 'relative'),
        'format': config.get('format', 'txt'),
        'sampling': sampling_params.get('enabled', False),
        'units': sampling_params.get('units', 'words'),
        'size': sampling_params.get('sample_size', 3000),
        'step': sampling_params.get('sample_step', None),
        'max_samples': sampling_params.get('max_samples', None),
        'samples_random': sampling_params.get('sample_random', False),
        'keep_punct': _flag('keep_punct'),
        'keep_sym': _flag('keep_sym'),
        'no_ascii': _flag('no_ascii'),
        'identify_lang': _flag('identify_lang'),
        'embedding': feature_config.get('embedding', None),
        'neighbouring_size': feature_config.get('neighbouring_size', 10),
        'culling': feature_config.get('culling', 0)
    }
173+

superstyl/preproc/features_extract.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,15 +20,15 @@ def count_features(text, feats ="words", n = 1):
2020
raise ValueError("Text cannot be empty.")
2121
if n < 1 or not isinstance(n, int):
2222
raise ValueError("n must be a positive integer.")
23-
if feats not in ["words", "chars", "affixes", "pos"]:
24-
raise ValueError("Unsupported feature type. Choose from 'words', 'chars', 'affixes', or 'pos'.")
23+
if feats not in ["words", "chars", "affixes", "pos", "met_line", "met_syll"]:
24+
raise ValueError("Unsupported feature type. Choose from 'words', 'chars', 'affixes', 'met_line', 'met_syll', or 'pos'.")
2525
if feats == "words":
2626
tokens = nltk.tokenize.wordpunct_tokenize(text)
2727
if n > 1:
2828
tokens = ["_".join(t) for t in list(nltk.ngrams(tokens, n))]
2929
total = len(tokens)
3030

31-
elif feats == "chars":
31+
elif feats in ["chars", "met_syll"]:
3232
tokens = [re.sub(r'\p{Z}', '_', ''.join(ngram)) for ngram in nltk.ngrams(text, n)]
3333
total = len(tokens)
3434

@@ -60,14 +60,20 @@ def count_features(text, feats ="words", n = 1):
6060
tokens = pos_tags
6161
total = len(tokens)
6262

63+
elif feats == "met_line":
64+
tokens = text.split()
65+
if n > 1:
66+
tokens = ["_".join(t) for t in list(nltk.ngrams(tokens, n))]
67+
total = len(tokens)
68+
6369
# Adding sentence length ; still commented as it is a work in progress, an integer won't do, a quantile would be better
6470
#elif feats == "sentenceLength":
6571
# sentences = nltk.tokenize.sent_tokenize(text)
6672
# tokens = [str(len(nltk.tokenize.word_tokenize(sentence))) for sentence in sentences]
6773

6874
#Adding an error message in case some distracted guy like me would enter something wrong:
6975
else:
70-
raise ValueError("Unsupported feature type. Choose from 'words', 'chars', 'affixes' or 'pos'.")
76+
raise ValueError("Unsupported feature type. Choose from 'words', 'chars', 'affixes', 'met_line', 'met_syll' or 'pos'.")
7177

7278
counts = Counter()
7379
counts.update(tokens)
@@ -102,7 +108,7 @@ def get_feature_list(myTexts, feats="words", n=1, freqsType="relative"):
102108
"""
103109
:param myTexts: a 'myTexts' object, containing documents to be processed
104110
:param feat_list: a list of features to be selected
105-
:param feats: type of feats (words, chars, affixes or POS)
111+
:param feats: type of feats (words, chars, affixes, POS, met_line, or met_syll)
106112
:param freqsType: "relative", "absolute" or "binary" frequencies
107113
:param n: n-grams length
108114
:return: list of features, with total frequency

0 commit comments

Comments
 (0)