diff --git a/.gitignore b/.gitignore
index 8ad09ebf..f6da5bc9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
# Self-defined
*result/
+*benchmark_result.csv
slct
# Byte-compiled / optimized / DLL files
diff --git a/README.md b/README.md
index cf6dfb59..bbf93fd7 100644
--- a/README.md
+++ b/README.md
@@ -50,7 +50,7 @@ All the log parsers have been evaluated across 16 different logs available in [l
- :point_down: Check the detailed bechmarking result table (click to expand)
+ :point_down: Check the detailed benchmarking result table (click to expand)
diff --git a/benchmark/AEL_benchmark.py b/benchmark/AEL_benchmark.py
deleted file mode 100644
index 6064f262..00000000
--- a/benchmark/AEL_benchmark.py
+++ /dev/null
@@ -1,164 +0,0 @@
-#!/usr/bin/env python
-
-import sys
-sys.path.append('../')
-from logparser import AEL, evaluator
-import os
-import pandas as pd
-
-
-input_dir = '../logs/' # The input directory of log file
-output_dir = 'AEL_result/' # The output directory of parsing results
-
-benchmark_settings = {
- 'HDFS': {
- 'log_file': 'HDFS/HDFS_2k.log',
- 'log_format': ' : ',
- 'regex': [r'blk_-?\d+', r'(\d+\.){3}\d+(:\d+)?'],
- 'minEventCount': 2,
- 'merge_percent' : 0.5
- },
-
- 'Hadoop': {
- 'log_file': 'Hadoop/Hadoop_2k.log',
- 'log_format': ' \[\] : ',
- 'regex': [r'(\d+\.){3}\d+'],
- 'minEventCount': 2,
- 'merge_percent' : 0.4
- },
-
- 'Spark': {
- 'log_file': 'Spark/Spark_2k.log',
- 'log_format': ' : ',
- 'regex': [r'(\d+\.){3}\d+', r'\b[KGTM]?B\b', r'([\w-]+\.){2,}[\w-]+'],
- 'minEventCount': 2,
- 'merge_percent' : 0.4
- },
-
- 'Zookeeper': {
- 'log_file': 'Zookeeper/Zookeeper_2k.log',
- 'log_format': ' - \[:@\] - ',
- 'regex': [r'(/|)(\d+\.){3}\d+(:\d+)?'],
- 'minEventCount': 2,
- 'merge_percent' : 0.4
- },
-
- 'BGL': {
- 'log_file': 'BGL/BGL_2k.log',
- 'log_format': ' ',
- 'regex': [r'core\.\d+'],
- 'minEventCount': 2,
- 'merge_percent' : 0.5
- },
-
- 'HPC': {
- 'log_file': 'HPC/HPC_2k.log',
- 'log_format': ' ',
- 'regex': [r'=\d+'],
- 'minEventCount': 5,
- 'merge_percent' : 0.4
- },
-
- 'Thunderbird': {
- 'log_file': 'Thunderbird/Thunderbird_2k.log',
- 'log_format': ' (\[\])?: ',
- 'regex': [r'(\d+\.){3}\d+'],
- 'minEventCount': 2,
- 'merge_percent' : 0.4
- },
-
- 'Windows': {
- 'log_file': 'Windows/Windows_2k.log',
- 'log_format': ' , ',
- 'regex': [r'0x.*?\s'],
- 'minEventCount': 2,
- 'merge_percent' : 0.4
- },
-
- 'Linux': {
- 'log_file': 'Linux/Linux_2k.log',
- 'log_format': ' (\[\])?: ',
- 'regex': [r'(\d+\.){3}\d+', r'\d{2}:\d{2}:\d{2}'],
- 'minEventCount': 2,
- 'merge_percent' : 0.6
- },
-
- 'Andriod': {
- 'log_file': 'Andriod/Andriod_2k.log',
- 'log_format': ' : ',
- 'regex': [r'(/[\w-]+)+', r'([\w-]+\.){2,}[\w-]+', r'\b(\-?\+?\d+)\b|\b0[Xx][a-fA-F\d]+\b|\b[a-fA-F\d]{4,}\b'],
- 'minEventCount': 2,
- 'merge_percent' : 0.6
- },
-
- 'HealthApp': {
- 'log_file': 'HealthApp/HealthApp_2k.log',
- 'log_format': '\|\|\|',
- 'regex': [],
- 'minEventCount': 2,
- 'merge_percent' : 0.6
- },
-
- 'Apache': {
- 'log_file': 'Apache/Apache_2k.log',
- 'log_format': '\[\] \[\] ',
- 'regex': [r'(\d+\.){3}\d+'],
- 'minEventCount': 2,
- 'merge_percent' : 0.4
- },
-
- 'Proxifier': {
- 'log_file': 'Proxifier/Proxifier_2k.log',
- 'log_format': '\[\] - ',
- 'regex': [r'<\d+\s?sec', r'([\w-]+\.)+[\w-]+(:\d+)?', r'\d{2}:\d{2}(:\d{2})*', r'[KGTM]B'],
- 'minEventCount': 2,
- 'merge_percent' : 0.4
- },
-
- 'OpenSSH': {
- 'log_file': 'OpenSSH/OpenSSH_2k.log',
- 'log_format': ' sshd\[\]: ',
- 'regex': [r'(\d+\.){3}\d+', r'([\w-]+\.){2,}[\w-]+'],
- 'minEventCount': 10,
- 'merge_percent' : 0.7
- },
-
- 'OpenStack': {
- 'log_file': 'OpenStack/OpenStack_2k.log',
- 'log_format': ' \[\] ',
- 'regex': [r'((\d+\.){3}\d+,?)+', r'/.+?\s', r'\d+'],
- 'minEventCount': 6,
- 'merge_percent' : 0.5
- },
-
- 'Mac': {
- 'log_file': 'Mac/Mac_2k.log',
- 'log_format': ' \[\]( \(\))?: ',
- 'regex': [r'([\w-]+\.){2,}[\w-]+'],
- 'minEventCount': 2,
- 'merge_percent' : 0.6
- }
-}
-
-bechmark_result = []
-for dataset, setting in benchmark_settings.iteritems():
- print('\n=== Evaluation on %s ==='%dataset)
- indir = os.path.join(input_dir, os.path.dirname(setting['log_file']))
- log_file = os.path.basename(setting['log_file'])
-
- parser = AEL.LogParser(log_format=setting['log_format'], indir=indir, outdir=output_dir,
- minEventCount=setting['minEventCount'], merge_percent=setting['merge_percent'], rex=setting['regex'])
- parser.parse(log_file)
-
- F1_measure, accuracy = evaluator.evaluate(
- groundtruth=os.path.join(indir, log_file + '_structured.csv'),
- parsedresult=os.path.join(output_dir, log_file + '_structured.csv')
- )
- bechmark_result.append([dataset, F1_measure, accuracy])
-
-
-print('\n=== Overall evaluation results ===')
-df_result = pd.DataFrame(bechmark_result, columns=['Dataset', 'F1_measure', 'Accuracy'])
-df_result.set_index('Dataset', inplace=True)
-print(df_result)
-df_result.T.to_csv('AEL_bechmark_result.csv')
diff --git a/benchmark/Benchmark.py b/benchmark/Benchmark.py
new file mode 100644
index 00000000..19bb88f1
--- /dev/null
+++ b/benchmark/Benchmark.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python
+
+import sys
+sys.path.append('../')
+from logparser import evaluator, AEL, Drain, IPLoM, LenMa, LFA, LKE, LogCluster, LogMine, LogSig, MoLFI, SHISO, SLCT, Spell
+from LogSettings import benchmark_settings, input_dir
+import pandas as pd
+import os
+
+
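+# Parser factories keyed by name: each takes one dataset's settings dict and builds that parser; indir/output_dir are globals read at call time (set in the loop below)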
+parsers = {
+ "AEL": lambda setting: AEL.LogParser(log_format=setting['log_format'], indir=indir, outdir=output_dir,
+ minEventCount=setting['minEventCount'], merge_percent=setting['merge_percent'], rex=setting['regex']),
+ "Drain": lambda setting: Drain.LogParser(log_format=setting['log_format'], indir=indir, outdir=output_dir, rex=setting['regex'], depth=setting['depth'], st=setting['st']),
+ "IPLoM": lambda setting: IPLoM.LogParser(log_format=setting['log_format'], indir=indir, outdir=output_dir,
+ CT=setting['CT'], lowerBound=setting['lowerBound'], rex=setting['regex']),
+ "LenMa": lambda setting: LenMa.LogParser(log_format=setting['log_format'], indir=indir, outdir=output_dir, rex=setting['regex'], threshold=setting['threshold']),
+ "LFA": lambda setting: LFA.LogParser(log_format=setting['log_format'], indir=indir, outdir=output_dir, rex=setting['regex']),
+ "LKE": lambda setting: LKE.LogParser(log_format=setting['log_format'], indir=indir, outdir=output_dir, rex=setting['regex'],
+ split_threshold=setting['split_threshold']),
+ "LogCluster": lambda setting: LogCluster.LogParser(indir, setting['log_format'], output_dir, rex=setting['regex'], rsupport=setting['rsupport']),
+ "LogMine": lambda setting: LogMine.LogParser(log_format=setting['log_format'], indir=indir, outdir=output_dir,
+ rex=setting['regex'], max_dist=setting['max_dist'], k=setting['k'],
+ levels=setting['levels']),
+ "LogSig": lambda setting: LogSig.LogParser(log_format=setting['log_format'], indir=indir, outdir=output_dir, rex=setting['regex'], groupNum=setting['groupNum']),
+ "MoLFI": lambda setting: MoLFI.LogParser(log_format=setting['log_format'], indir=indir, outdir=output_dir, rex=setting['regex']),
+ "SHISO": lambda setting: SHISO.LogParser(log_format=setting['log_format'], indir=indir, outdir=output_dir, rex=setting['regex'],
+ maxChildNum=setting['maxChildNum'], mergeThreshold=setting['mergeThreshold'],
+ formatLookupThreshold=setting['formatLookupThreshold'], superFormatThreshold=setting['superFormatThreshold']),
+ "SLCT": lambda setting: SLCT.LogParser(log_format=setting['log_format'], indir=indir, outdir=output_dir,
+ rex=setting['regex'], support=setting['support']),
+ "Spell": lambda setting: Spell.LogParser(log_format=setting['log_format'], indir=indir,
+ outdir=output_dir, rex=setting['regex'], tau=setting['tau'])
+}
+
+bm_parsers = []
+for arg in sys.argv:
+ if arg in parsers:
+ bm_parsers.append(arg)
+if bm_parsers == []:
+ bm_parsers = parsers.keys()
+
+bm_datasets = []
+for arg in sys.argv:
+ if arg in benchmark_settings.keys():
+ bm_datasets.append(arg)
+if bm_datasets == []:
+ bm_datasets = benchmark_settings.keys()
+
+print("\n== Benchmarking " + ', '.join(bm_parsers) + " on " + ', '.join(bm_datasets) + " ==\n")
+
+for bm_parser_name in bm_parsers:
+ bm_parser = parsers[bm_parser_name]
+ # Output directory for this parser's parsing results, derived from the parser name (e.g. 'AEL_result/')
+ output_dir = bm_parser_name + '_result/'
+ benchmark_result = []
+ for dataset, setting in benchmark_settings.items():
+ if not (dataset in bm_datasets):
+ continue
+ print('\n=== Evaluation of %s on %s ===' % (bm_parser_name, dataset))
+ indir = os.path.join(input_dir, os.path.dirname(setting['log_file']))
+ log_file = os.path.basename(setting['log_file'])
+ bm_parser(setting).parse(log_file)
+
+ F1_measure, accuracy = evaluator.evaluate(
+ groundtruth=os.path.join(indir, log_file + '_structured.csv'),
+ parsedresult=os.path.join(output_dir, log_file + '_structured.csv')
+ )
+ benchmark_result.append([dataset, F1_measure, accuracy])
+
+ print('\n=== Overall evaluation results ===')
+ df_result = pd.DataFrame(benchmark_result, columns=[
+ 'Dataset', 'F1_measure', 'Accuracy'])
+ df_result.set_index('Dataset', inplace=True)
+ print(df_result)
+ df_result.T.to_csv(bm_parser_name + '_benchmark_result.csv')
diff --git a/benchmark/Drain_benchmark.py b/benchmark/Drain_benchmark.py
deleted file mode 100644
index d42e1118..00000000
--- a/benchmark/Drain_benchmark.py
+++ /dev/null
@@ -1,163 +0,0 @@
-#!/usr/bin/env python
-
-import sys
-sys.path.append('../')
-from logparser import Drain, evaluator
-import os
-import pandas as pd
-
-
-input_dir = '../logs/' # The input directory of log file
-output_dir = 'Drain_result/' # The output directory of parsing results
-
-benchmark_settings = {
- 'HDFS': {
- 'log_file': 'HDFS/HDFS_2k.log',
- 'log_format': ' : ',
- 'regex': [r'blk_-?\d+', r'(\d+\.){3}\d+(:\d+)?'],
- 'st': 0.5,
- 'depth': 4
- },
-
- 'Hadoop': {
- 'log_file': 'Hadoop/Hadoop_2k.log',
- 'log_format': ' \[\] : ',
- 'regex': [r'(\d+\.){3}\d+'],
- 'st': 0.5,
- 'depth': 4
- },
-
- 'Spark': {
- 'log_file': 'Spark/Spark_2k.log',
- 'log_format': ' : ',
- 'regex': [r'(\d+\.){3}\d+', r'\b[KGTM]?B\b', r'([\w-]+\.){2,}[\w-]+'],
- 'st': 0.5,
- 'depth': 4
- },
-
- 'Zookeeper': {
- 'log_file': 'Zookeeper/Zookeeper_2k.log',
- 'log_format': ' - \[:@\] - ',
- 'regex': [r'(/|)(\d+\.){3}\d+(:\d+)?'],
- 'st': 0.5,
- 'depth': 4
- },
-
- 'BGL': {
- 'log_file': 'BGL/BGL_2k.log',
- 'log_format': ' ',
- 'regex': [r'core\.\d+'],
- 'st': 0.5,
- 'depth': 4
- },
-
- 'HPC': {
- 'log_file': 'HPC/HPC_2k.log',
- 'log_format': ' ',
- 'regex': [r'=\d+'],
- 'st': 0.5,
- 'depth': 4
- },
-
- 'Thunderbird': {
- 'log_file': 'Thunderbird/Thunderbird_2k.log',
- 'log_format': ' (\[\])?: ',
- 'regex': [r'(\d+\.){3}\d+'],
- 'st': 0.5,
- 'depth': 4
- },
-
- 'Windows': {
- 'log_file': 'Windows/Windows_2k.log',
- 'log_format': ' , ',
- 'regex': [r'0x.*?\s'],
- 'st': 0.7,
- 'depth': 5
- },
-
- 'Linux': {
- 'log_file': 'Linux/Linux_2k.log',
- 'log_format': ' (\[\])?: ',
- 'regex': [r'(\d+\.){3}\d+', r'\d{2}:\d{2}:\d{2}'],
- 'st': 0.39,
- 'depth': 6
- },
-
- 'Andriod': {
- 'log_file': 'Andriod/Andriod_2k.log',
- 'log_format': ' : ',
- 'regex': [r'(/[\w-]+)+', r'([\w-]+\.){2,}[\w-]+', r'\b(\-?\+?\d+)\b|\b0[Xx][a-fA-F\d]+\b|\b[a-fA-F\d]{4,}\b'],
- 'st': 0.2,
- 'depth': 6
- },
-
- 'HealthApp': {
- 'log_file': 'HealthApp/HealthApp_2k.log',
- 'log_format': '\|\|\|',
- 'regex': [],
- 'st': 0.2,
- 'depth': 4
- },
-
- 'Apache': {
- 'log_file': 'Apache/Apache_2k.log',
- 'log_format': '\[\] \[\] ',
- 'regex': [r'(\d+\.){3}\d+'],
- 'st': 0.5,
- 'depth': 4
- },
-
- 'Proxifier': {
- 'log_file': 'Proxifier/Proxifier_2k.log',
- 'log_format': '\[\] - ',
- 'regex': [r'<\d+\ssec', r'([\w-]+\.)+[\w-]+(:\d+)?', r'\d{2}:\d{2}(:\d{2})*', r'[KGTM]B'],
- 'st': 0.6,
- 'depth': 3
- },
-
- 'OpenSSH': {
- 'log_file': 'OpenSSH/OpenSSH_2k.log',
- 'log_format': ' sshd\[\]: ',
- 'regex': [r'(\d+\.){3}\d+', r'([\w-]+\.){2,}[\w-]+'],
- 'st': 0.6,
- 'depth': 5
- },
-
- 'OpenStack': {
- 'log_file': 'OpenStack/OpenStack_2k.log',
- 'log_format': ' \[\] ',
- 'regex': [r'((\d+\.){3}\d+,?)+', r'/.+?\s', r'\d+'],
- 'st': 0.5,
- 'depth': 5
- },
-
- 'Mac': {
- 'log_file': 'Mac/Mac_2k.log',
- 'log_format': ' \[\]( \(\))?: ',
- 'regex': [r'([\w-]+\.){2,}[\w-]+'],
- 'st': 0.7,
- 'depth': 6
- },
-}
-
-bechmark_result = []
-for dataset, setting in benchmark_settings.iteritems():
- print('\n=== Evaluation on %s ==='%dataset)
- indir = os.path.join(input_dir, os.path.dirname(setting['log_file']))
- log_file = os.path.basename(setting['log_file'])
-
- parser = Drain.LogParser(log_format=setting['log_format'], indir=indir, outdir=output_dir, rex=setting['regex'], depth=setting['depth'], st=setting['st'])
- parser.parse(log_file)
-
- F1_measure, accuracy = evaluator.evaluate(
- groundtruth=os.path.join(indir, log_file + '_structured.csv'),
- parsedresult=os.path.join(output_dir, log_file + '_structured.csv')
- )
- bechmark_result.append([dataset, F1_measure, accuracy])
-
-
-print('\n=== Overall evaluation results ===')
-df_result = pd.DataFrame(bechmark_result, columns=['Dataset', 'F1_measure', 'Accuracy'])
-df_result.set_index('Dataset', inplace=True)
-print(df_result)
-df_result.T.to_csv('Drain_bechmark_result.csv')
diff --git a/benchmark/IPLoM_benchmark.py b/benchmark/IPLoM_benchmark.py
deleted file mode 100644
index 0dcf04db..00000000
--- a/benchmark/IPLoM_benchmark.py
+++ /dev/null
@@ -1,164 +0,0 @@
-#!/usr/bin/env python
-
-import sys
-sys.path.append('../')
-from logparser import IPLoM, evaluator
-import os
-import pandas as pd
-
-
-input_dir = '../logs/' # The input directory of log file
-output_dir = 'IPLoM_result/' # The output directory of parsing results
-
-benchmark_settings = {
- 'HDFS': {
- 'log_file': 'HDFS/HDFS_2k.log',
- 'log_format': ' : ',
- 'CT': 0.35,
- 'lowerBound': 0.25,
- 'regex': [r'blk_-?\d+', r'(\d+\.){3}\d+(:\d+)?']
- },
-
- 'Hadoop': {
- 'log_file': 'Hadoop/Hadoop_2k.log',
- 'log_format': ' \[\] : ',
- 'CT': 0.4,
- 'lowerBound': 0.2,
- 'regex': [r'(\d+\.){3}\d+']
- },
-
- 'Spark': {
- 'log_file': 'Spark/Spark_2k.log',
- 'log_format': ' : ',
- 'CT': 0.35,
- 'lowerBound': 0.3,
- 'regex': [r'(\d+\.){3}\d+', r'\b[KGTM]?B\b', r'([\w-]+\.){2,}[\w-]+']
- },
-
- 'Zookeeper': {
- 'log_file': 'Zookeeper/Zookeeper_2k.log',
- 'log_format': ' - \[:@\] - ',
- 'CT': 0.4,
- 'lowerBound': 0.7,
- 'regex': [r'(/|)(\d+\.){3}\d+(:\d+)?']
- },
-
- 'BGL': {
- 'log_file': 'BGL/BGL_2k.log',
- 'log_format': ' ',
- 'CT': 0.4,
- 'lowerBound': 0.01,
- 'regex': [r'core\.\d+']
- },
-
- 'HPC': {
- 'log_file': 'HPC/HPC_2k.log',
- 'log_format': ' ',
- 'CT': 0.58,
- 'lowerBound': 0.25,
- 'regex': [r'=\d+']
- },
-
- 'Thunderbird': {
- 'log_file': 'Thunderbird/Thunderbird_2k.log',
- 'log_format': '