Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified .coverage
Binary file not shown.
264 changes: 183 additions & 81 deletions load_corpus.py
Original file line number Diff line number Diff line change
@@ -1,117 +1,219 @@
from superstyl.load import load_corpus
from superstyl.load_from_config import load_corpus_from_config
from superstyl.config import Config
import json

# TODO: eliminate features that occur only n times ?
# Do the Moisl Selection ?
# TODO: document the new 'lemma' feat for TEI loading

if __name__ == '__main__':
    # Command-line entry point: parse options, load a corpus with the requested
    # feature extraction, and save the feature list (json) and corpus (csv).

    import argparse

    parser = argparse.ArgumentParser(
        description="Load a corpus and extract features for stylometric analysis."
    )
    parser.add_argument('-s',
                        nargs='+',
                        help="paths to files or to json config file",
                        # Not required at parse time so that --config alone can
                        # supply the corpus paths; presence is validated below.
                        required=False,
                        default=None
                        )
    parser.add_argument('--json',
                        action='store_true',
                        help="indicates that the path provided with -s is a JSON config file, "
                             "containing all the options to load the corpus/features"
                        )
    parser.add_argument('--config',
                        action='store',
                        help="path to a new-style JSON config file; options passed on the "
                             "command line (e.g. -s) override the config file values",
                        type=str,
                        default=None
                        )
    parser.add_argument('-o',
                        action='store',
                        help="optional base name of output files",
                        type=str,
                        default=False
                        )
    # Feature list
    parser.add_argument('-f',
                        action="store",
                        help="optional list of features, either in json (generated by"
                             " Superstyl) or simple txt (one word per line)",
                        default=False
                        )
    parser.add_argument('-t',
                        action='store',
                        help="types of features (words, chars, affixes - "
                             "as per Sapkota et al. 2015 -, as well as lemma or pos, met_line, "
                             "met_syll (those four last only for TEI files with proper annotation)",
                        type=str, default="words",
                        choices=["words", "chars", "affixes", "pos", "lemma", "met_line", "met_syll"]
                        )
    parser.add_argument('-n',
                        action='store',
                        help="n grams lengths (default 1)",
                        default=1,
                        type=int)
    parser.add_argument('-k',
                        action='store',
                        help="How many most frequent features?",
                        default=5000,
                        type=int
                        )
    parser.add_argument('--freqs',
                        action='store',
                        help="relative, absolute or binarised freqs",
                        default="relative",
                        choices=["relative", "absolute", "binary"]
                        )
    parser.add_argument('-x',
                        action='store',
                        help="format (txt, xml, tei, or txm) WARNING: only txt is fully implemented",
                        default="txt",
                        choices=["txt", "xml", "tei", 'txm']
                        )
    parser.add_argument('--sampling',
                        action='store_true',
                        help="Sample the texts?",
                        default=False
                        )
    parser.add_argument('--sample_units',
                        action='store',
                        help="Units of length for sampling (words, verses; default: words)",
                        choices=["words", "verses"],
                        default="words",
                        type=str
                        )
    parser.add_argument('--sample_size',
                        action='store',
                        help="Size for sampling (default: 3000)",
                        default=3000,
                        type=int
                        )
    parser.add_argument('--sample_step',
                        action='store',
                        help="Step for sampling with overlap (default is no overlap)",
                        default=None,
                        type=int
                        )
    parser.add_argument('--max_samples',
                        action='store',
                        help="Maximum number of (randomly selected) samples per class, e.g. author (default is all)",
                        default=None,
                        type=int
                        )
    parser.add_argument('--samples_random',
                        action='store_true',
                        help="Should random sampling with replacement be performed "
                             "instead of continuous sampling (default: false)",
                        default=False
                        )
    parser.add_argument('--keep_punct',
                        action='store_true',
                        help="whether to keep punctuation and caps (default is False)",
                        default=False
                        )
    parser.add_argument('--keep_sym',
                        action='store_true',
                        help="if true, same as keep_punct, plus no Unidecode, "
                             "and numbers are kept as well (default is False)",
                        default=False
                        )
    parser.add_argument('--no_ascii',
                        action='store_true',
                        help="disables the conversion to ascii as per the Unidecode module. "
                             "Useful for non Latin alphabet (default is conversion to ASCII)",
                        default=False
                        )
    parser.add_argument('--identify_lang',
                        action='store_true',
                        help="if true, should the language of each text be guessed, "
                             "using langdetect (default is False)",
                        default=False
                        )
    parser.add_argument('--embedding',
                        action="store",
                        help="optional path to a word2vec embedding in txt format to compute "
                             "frequencies among a set of semantic neighbourgs (i.e., pseudo-paronyms)",
                        default=False
                        )
    parser.add_argument('--neighbouring_size',
                        action="store",
                        help="size of semantic neighbouring in the embedding (n closest neighbours)",
                        default=10,
                        type=int
                        )
    parser.add_argument('--culling',
                        action="store",
                        help="percentage value for culling, meaning in what "
                             "percentage of samples should a feature be present "
                             "to be retained (default is 0, meaning no culling)",
                        default=0,
                        type=float)
    args = parser.parse_args()

    # Load feature list if provided (json produced by Superstyl, or plain txt
    # with one feature per line; anything else is ignored with a warning).
    my_feats = None
    if args.f:
        with open(args.f, 'r') as f:
            if args.f.endswith(".json"):
                print(".......loading preexisting feature list from json.......")
                my_feats = json.loads(f.read())

            elif args.f.endswith(".txt"):
                print(".......loading preexisting feature list from txt.......")
                # txt lists carry no counts; pair each feature with a 0 placeholder
                my_feats = [[feat.rstrip(), 0] for feat in f.readlines()]

            else:
                print(".......unknown feature list format. Ignoring.......")
                my_feats = None

    if args.json:
        # Legacy path: -s points directly to a JSON config file that fully
        # describes the corpus and features.
        if not args.s:
            parser.error("-s (path to the JSON config file) is required with --json")
        corpus, my_feats = load_corpus_from_config(args.s)

    else:
        if args.config:
            # Load from new-style JSON config file
            config = Config.from_json(args.config)
            # Override paths if provided via CLI
            if args.s:
                config.corpus.paths = args.s

        else:
            # Build the config entirely from command-line options
            if not args.s:
                parser.error("-s (paths) is required when not using --config")

            config = Config.from_kwargs(
                data_paths=args.s,
                feats=args.t,
                n=args.n,
                k=args.k,
                freqsType=args.freqs,
                format=args.x,
                sampling=args.sampling,
                units=args.sample_units,
                size=args.sample_size,
                step=args.sample_step,
                max_samples=args.max_samples,
                samples_random=args.samples_random,
                keep_punct=args.keep_punct,
                keep_sym=args.keep_sym,
                no_ascii=args.no_ascii,
                identify_lang=args.identify_lang,
                embedding=args.embedding,
                neighbouring_size=args.neighbouring_size,
                culling=args.culling
            )

        # Inject my_feats if provided
        if my_feats and config.features:
            config.features[0].feat_list = my_feats

        # Load corpus
        corpus, my_feats = load_corpus(config=config)

    # Determine output file names (explicit base name via -o, or a default
    # derived from the feature options)
    if args.o:
        feat_file = args.o + "_feats.json"
        corpus_file = args.o + ".csv"

    else:
        feat_file = f"feature_list_{args.t}{args.n}grams{args.k}mf.json"
        corpus_file = f"feats_tests_n{args.n}_k_{args.k}.csv"

    # Save results
    print(".......saving results.......")

    with open(feat_file, "w") as out:
        out.write(json.dumps(my_feats, ensure_ascii=False, indent=0))
    print(f"Features list saved to {feat_file}")

    # Save corpus
    # NOTE(review): corpus is presumably a pandas DataFrame (to_csv) — confirm
    # against superstyl.load.load_corpus.
    corpus.to_csv(corpus_file)
    print(f"Corpus saved to {corpus_file}")
48 changes: 26 additions & 22 deletions split.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
"""
Command-line tool for splitting datasets.
"""

import superstyl.preproc.select as sel


Expand All @@ -12,28 +16,28 @@
default=False)
parser.add_argument('-m', action="store", help="path to metadata file", required=False)
parser.add_argument('-e', action="store", help="path to excludes file", required=False)
parser.add_argument('--lang', action="store", help="analyse only file in this language (optional, for initial split only)", required=False)
parser.add_argument('--nosplit', action="store_true", help="no split (do not provide split file)", default=False)
parser.add_argument('--lang', action="store",
help="analyse only file in this language (optional, for initial split only)",
required=False)
parser.add_argument('--nosplit', action="store_true",
help="no split (do not provide split file)",
default=False)
parser.add_argument('--split_ratio', action="store", type=float,
help="validation split ratio (default: 0.1 = 10%%)",
default=0.1)
args = parser.parse_args()

# Dispatch: either apply a previously saved selection, or build a new one.
if args.s:
    # Apply existing selection
    sel.apply_selection(path=args.path, presplit_path=args.s)
else:
    # Create new selection (with or without split); the saved file name
    # records which mode produced it.
    sel.read_clean(
        path=args.path,
        metadata_path=args.m,
        excludes_path=args.e,
        savesplit="split_nosplit.json" if args.nosplit else "split.json",
        lang=args.lang,
        split=not args.nosplit,
        split_ratio=args.split_ratio
    )
Loading