From 44067c43c0ad2dd5353fa5b04611733a076771b0 Mon Sep 17 00:00:00 2001 From: Lorenz Buehmann Date: Tue, 26 Jun 2018 07:38:50 +0200 Subject: [PATCH 01/23] Flink reasoning with Jena datastructures. --- .../inference/flink/data/RDFGraph.scala | 23 ++- .../inference/flink/data/RDFGraphLoader.scala | 6 +- .../extraction/OWLHorstSchemaExtractor.scala | 4 +- .../extraction/RDFSSchemaExtractor.scala | 2 +- .../flink/extraction/SchemaExtractor.scala | 22 +-- .../forwardchaining/ForwardRuleReasoner.scala | 35 ++-- .../ForwardRuleReasonerOWLHorst.scala | 182 +++++++++--------- .../ForwardRuleReasonerRDFS.scala | 180 ++++++++--------- .../forwardchaining/TransitiveReasoner.scala | 39 ++-- 9 files changed, 253 insertions(+), 240 deletions(-) diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraph.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraph.scala index 66c64a8..b8c388d 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraph.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraph.scala @@ -1,9 +1,8 @@ package net.sansa_stack.inference.flink.data -import net.sansa_stack.inference.flink.utils.DataSetUtils import org.apache.flink.api.scala.{DataSet, _} -import org.apache.jena.graph.Triple -import net.sansa_stack.inference.data.RDFTriple +import org.apache.jena.graph.{Node, Triple} + import net.sansa_stack.inference.flink.utils.DataSetUtils.DataSetOps /** @@ -12,7 +11,7 @@ import net.sansa_stack.inference.flink.utils.DataSetUtils.DataSetOps * @author Lorenz Buehmann * */ -case class RDFGraph(triples: DataSet[RDFTriple]) { +case class RDFGraph(triples: DataSet[Triple]) { /** * Returns a DataSet of triples that match with the given input. 
@@ -22,11 +21,11 @@ case class RDFGraph(triples: DataSet[RDFTriple]) { * @param o the object * @return DataSet of triples */ - def find(s: Option[String] = None, p: Option[String] = None, o: Option[String] = None): DataSet[RDFTriple] = { + def find(s: Option[Node] = None, p: Option[Node] = None, o: Option[Node] = None): DataSet[Triple] = { triples.filter(t => - (s == None || t.s == s.get) && - (p == None || t.p == p.get) && - (o == None || t.o == o.get) + (s.isEmpty || t.subjectMatches(s.get)) && + (p.isEmpty || t.predicateMatches(p.get)) && + (o.isEmpty || t.objectMatches(o.get)) ) } @@ -35,11 +34,11 @@ case class RDFGraph(triples: DataSet[RDFTriple]) { * * @return DataSet of triples */ - def find(triple: Triple): DataSet[RDFTriple] = { + def find(triple: Triple): DataSet[Triple] = { find( - if (triple.getSubject.isVariable) None else Option(triple.getSubject.toString), - if (triple.getPredicate.isVariable) None else Option(triple.getPredicate.toString), - if (triple.getObject.isVariable) None else Option(triple.getObject.toString) + if (triple.getSubject.isVariable) None else Option(triple.getSubject), + if (triple.getPredicate.isVariable) None else Option(triple.getPredicate), + if (triple.getObject.isVariable) None else Option(triple.getObject) ) } diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraphLoader.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraphLoader.scala index a94d1ca..e631d4a 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraphLoader.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraphLoader.scala @@ -9,6 +9,8 @@ import net.sansa_stack.inference.data.RDFTriple import org.apache.flink.configuration.Configuration import scala.language.implicitConversions +import org.apache.jena.rdf.model.impl.NTripleReader + import net.sansa_stack.inference.utils.NTriplesStringToRDFTriple /** @@ 
-33,6 +35,7 @@ object RDFGraphLoader { // set the recursive enumeration parameter parameters.setBoolean("recursive.file.enumeration", true) + // pass the configuration to the data source val triples = env.readTextFile(path.toString).withParameters(parameters) .map(line => line.replace(">", "").replace("<", "").split("\\s+")) // line to tokens @@ -48,7 +51,8 @@ object RDFGraphLoader { val converter = new NTriplesStringToRDFTriple() - val triples = tmp.map(f => env.readTextFile(f).flatMap(line => converter.apply(line))).reduce(_ union _).name("triples") + val triples = tmp + .map(f => env.readTextFile(f).flatMap(line => converter.apply(line))).reduce(_ union _).name("triples") RDFGraph(triples) } diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/extraction/OWLHorstSchemaExtractor.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/extraction/OWLHorstSchemaExtractor.scala index 422deb5..065f8d0 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/extraction/OWLHorstSchemaExtractor.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/extraction/OWLHorstSchemaExtractor.scala @@ -42,12 +42,12 @@ class OWLHorstSchemaExtractor() OWL2.allValuesFrom, OWL2.hasValue, OWL2.onProperty - ).map(p => p.getURI) + ).map(p => p.asNode()) )( Set( OWL2.TransitiveProperty, OWL2.FunctionalProperty, OWL2.InverseFunctionalProperty, OWL2.SymmetricProperty - ).map(p => p.getURI) + ).map(p => p.asNode()) ) {} diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/extraction/RDFSSchemaExtractor.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/extraction/RDFSSchemaExtractor.scala index 945a0d8..086671c 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/extraction/RDFSSchemaExtractor.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/extraction/RDFSSchemaExtractor.scala 
@@ -15,7 +15,7 @@ import org.apache.jena.vocabulary.RDFS * @author Lorenz Buehmann */ class RDFSSchemaExtractor() - extends SchemaExtractor()(Set(RDFS.subClassOf, RDFS.subPropertyOf, RDFS.domain, RDFS.range).map(p => p.getURI))() {} + extends SchemaExtractor()(Set(RDFS.subClassOf, RDFS.subPropertyOf, RDFS.domain, RDFS.range).map(p => p.asNode()))() {} object RDFSSchemaExtractor { def apply: RDFSSchemaExtractor = new RDFSSchemaExtractor() diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/extraction/SchemaExtractor.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/extraction/SchemaExtractor.scala index 8b32c8e..10348a8 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/extraction/SchemaExtractor.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/extraction/SchemaExtractor.scala @@ -1,9 +1,8 @@ package net.sansa_stack.inference.flink.extraction import org.apache.flink.api.scala.DataSet -import org.apache.jena.vocabulary.RDFS +import org.apache.jena.graph.{Node, Triple} -import net.sansa_stack.inference.data.RDFTriple import net.sansa_stack.inference.flink.data.RDFGraph import net.sansa_stack.inference.utils.Logging @@ -11,16 +10,16 @@ import net.sansa_stack.inference.utils.Logging * @author Lorenz Buehmann */ abstract class SchemaExtractor - (subjects: Set[String] = Set()) - (predicates: Set[String] = Set()) - (objects: Set[String] = Set()) + (subjects: Set[Node] = Set()) + (predicates: Set[Node] = Set()) + (objects: Set[Node] = Set()) extends Logging with Serializable{ - val subjectsFilter: ((RDFTriple) => Boolean) = t => subjects.contains(t.s) - val predicatesFilter: ((RDFTriple) => Boolean) = t => predicates.contains(t.p) - val objectsFilter: ((RDFTriple) => Boolean) = t => objects.contains(t.o) + val subjectsFilter: ((Triple) => Boolean) = t => subjects.contains(t.getSubject) + val predicatesFilter: ((Triple) => Boolean) = t => 
predicates.contains(t.getPredicate) + val objectsFilter: ((Triple) => Boolean) = t => objects.contains(t.getObject) - private def or(ps: (RDFTriple => Boolean)*) = (a: RDFTriple) => ps.exists(_(a)) + private def or(ps: (Triple => Boolean)*) = (a: Triple) => ps.exists(_(a)) /** * Extract a graph that contains only the schema triples. @@ -28,8 +27,7 @@ abstract class SchemaExtractor * @param graph the graph * @return a graph containing only the schema triples */ - def extract(graph: RDFGraph): RDFGraph = - new RDFGraph(extract(graph.triples)) + def extract(graph: RDFGraph): RDFGraph = RDFGraph(extract(graph.triples)) /** * Extract a DataSet that contains only the schema triples. @@ -37,7 +35,7 @@ abstract class SchemaExtractor * @param triples the triples * @return the schema triples */ - def extract(triples: DataSet[RDFTriple]): DataSet[RDFTriple] = + def extract(triples: DataSet[Triple]): DataSet[Triple] = triples .filter(or(subjectsFilter, predicatesFilter, objectsFilter)) .name("schema-triples") diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasoner.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasoner.scala index c5bda4d..45e5cca 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasoner.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasoner.scala @@ -1,11 +1,11 @@ package net.sansa_stack.inference.flink.forwardchaining -import net.sansa_stack.inference.data.RDFTriple import net.sansa_stack.inference.flink.data.RDFGraph import org.apache.flink.api.scala.DataSet - import scala.collection.mutable +import org.apache.jena.graph.{Node, Triple} + /** * A forward chaining based reasoner. 
* @@ -22,6 +22,15 @@ trait ForwardRuleReasoner extends TransitiveReasoner{ */ def apply(graph: RDFGraph) : RDFGraph + /** + * Applies forward chaining to the given set of RDF triples and returns a new set of RDF triples that + * contains all additional triples based on the underlying set of rules. + * + * @param triples the RDF triples + * @return the materialized RDF triples + */ + def apply(triples: DataSet[Triple]) : DataSet[Triple] = apply(RDFGraph(triples)).triples + /** * Extracts all triples for the given predicate. * @@ -29,8 +38,8 @@ trait ForwardRuleReasoner extends TransitiveReasoner{ * @param predicate the predicate * @return the set of triples that contain the predicate */ - def extractTriples(triples: mutable.Set[RDFTriple], predicate: String): mutable.Set[RDFTriple] = { - triples.filter(triple => triple.p == predicate) + def extractTriples(triples: mutable.Set[Triple], predicate: Node): mutable.Set[Triple] = { + triples.filter(triple => triple.predicateMatches(predicate)) } /** @@ -40,8 +49,8 @@ trait ForwardRuleReasoner extends TransitiveReasoner{ * @param predicate the predicate * @return the DataSet of triples that contain the predicate */ - def extractTriples(triples: DataSet[RDFTriple], predicate: String): DataSet[RDFTriple] = { - triples.filter(triple => triple.p == predicate) + def extractTriples(triples: DataSet[Triple], predicate: Node): DataSet[Triple] = { + triples.filter(triple => triple.predicateMatches(predicate)) } /** @@ -53,22 +62,22 @@ trait ForwardRuleReasoner extends TransitiveReasoner{ * @param obj the object * @return the DataSet of triples that match */ - def extractTriples(triples: DataSet[RDFTriple], - subject: Option[String], - predicate: Option[String], - obj: Option[String]): DataSet[RDFTriple] = { + def extractTriples(triples: DataSet[Triple], + subject: Option[Node], + predicate: Option[Node], + obj: Option[Node]): DataSet[Triple] = { var extractedTriples = triples if(subject.isDefined) { - extractedTriples = 
extractedTriples.filter(triple => triple.s == subject.get) + extractedTriples = extractedTriples.filter(triple => triple.subjectMatches(subject.get)) } if(predicate.isDefined) { - extractedTriples = extractedTriples.filter(triple => triple.p == predicate.get) + extractedTriples = extractedTriples.filter(triple => triple.predicateMatches(predicate.get)) } if(obj.isDefined) { - extractedTriples = extractedTriples.filter(triple => triple.o == obj.get) + extractedTriples = extractedTriples.filter(triple => triple.objectMatches(obj.get)) } extractedTriples diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasonerOWLHorst.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasonerOWLHorst.scala index 68db1f1..2093e52 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasonerOWLHorst.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasonerOWLHorst.scala @@ -1,12 +1,13 @@ package net.sansa_stack.inference.flink.forwardchaining -import net.sansa_stack.inference.flink.data.RDFGraph import org.apache.flink.api.scala.{DataSet, ExecutionEnvironment, _} +import org.apache.jena.graph.{Node, Triple} import org.apache.jena.vocabulary.{OWL2, RDF, RDFS} -import net.sansa_stack.inference.data.RDFTriple -import net.sansa_stack.inference.utils.CollectionUtils import org.slf4j.LoggerFactory +import net.sansa_stack.inference.flink.data.RDFGraph +import net.sansa_stack.inference.utils.CollectionUtils + /** * A forward chaining implementation of the OWL Horst entailment regime. 
* @@ -26,26 +27,26 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule // extract the schema data - var subClassOfTriples = extractTriples(triplesRDD, RDFS.subClassOf.getURI) // rdfs:subClassOf - var subPropertyOfTriples = extractTriples(triplesRDD, RDFS.subPropertyOf.getURI) // rdfs:subPropertyOf - val domainTriples = extractTriples(triplesRDD, RDFS.domain.getURI) // rdfs:domain - val rangeTriples = extractTriples(triplesRDD, RDFS.range.getURI) // rdfs:range - val equivClassTriples = extractTriples(triplesRDD, OWL2.equivalentClass.getURI) // owl:equivalentClass - val equivPropertyTriples = extractTriples(triplesRDD, OWL2.equivalentProperty.getURI) // owl:equivalentProperty + var subClassOfTriples = extractTriples(triplesRDD, RDFS.subClassOf.asNode()) // rdfs:subClassOf + var subPropertyOfTriples = extractTriples(triplesRDD, RDFS.subPropertyOf.asNode()) // rdfs:subPropertyOf + val domainTriples = extractTriples(triplesRDD, RDFS.domain.asNode()) // rdfs:domain + val rangeTriples = extractTriples(triplesRDD, RDFS.range.asNode()) // rdfs:range + val equivClassTriples = extractTriples(triplesRDD, OWL2.equivalentClass.asNode()) // owl:equivalentClass + val equivPropertyTriples = extractTriples(triplesRDD, OWL2.equivalentProperty.asNode()) // owl:equivalentProperty // 1. 
we have to process owl:equivalentClass and owl:equivalentProperty before computing the transitive closure // rdfp12a: (?C owl:equivalentClass ?D) -> (?C rdfs:subClassOf ?D ) - val tmp_12a = equivClassTriples.map(t => RDFTriple(t.s, RDFS.subClassOf.getURI, t.o)) + val tmp_12a = equivClassTriples.map(t => Triple.create(t.getSubject, RDFS.subClassOf.asNode(), t.getObject)) // rdfp12b: (?C owl:equivalentClass ?D) -> (?D rdfs:subClassOf ?C ) - val tmp_12b = equivClassTriples.map(t => RDFTriple(t.o, RDFS.subClassOf.getURI, t.s)) + val tmp_12b = equivClassTriples.map(t => Triple.create(t.getObject, RDFS.subClassOf.asNode(), t.getSubject)) subClassOfTriples = env.union(Seq(subClassOfTriples, tmp_12a, tmp_12b)) .distinct() // rdfp13a: (?C owl:equivalentProperty ?D) -> (?C rdfs:subPropertyOf ?D ) - val tmp_13a = equivPropertyTriples.map(t => RDFTriple(t.s, RDFS.subPropertyOf.getURI, t.o)) + val tmp_13a = equivPropertyTriples.map(t => Triple.create(t.getSubject, RDFS.subPropertyOf.asNode(), t.getObject)) // rdfp13b: (?C owl:equivalentProperty ?D) -> (?D rdfs:subPropertyOf ?C ) - val tmp_13b = equivPropertyTriples.map(t => RDFTriple(t.o, RDFS.subPropertyOf.getURI, t.s)) + val tmp_13b = equivPropertyTriples.map(t => Triple.create(t.getObject, RDFS.subPropertyOf.asNode(), t.getSubject)) subPropertyOfTriples = env.union(Seq(subPropertyOfTriples, tmp_13a, tmp_13b)) .distinct() @@ -57,10 +58,10 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule // we put all into maps which should be more efficient later on - val subClassOfMap = CollectionUtils.toMultiMap(subClassOfTriplesTrans.map(t => (t.s, t.o)).collect) - val subPropertyMap = CollectionUtils.toMultiMap(subPropertyOfTriplesTrans.map(t => (t.s, t.o)).collect) - val domainMap = domainTriples.map(t => (t.s, t.o)).collect.toMap - val rangeMap = rangeTriples.map(t => (t.s, t.o)).collect.toMap + val subClassOfMap = CollectionUtils.toMultiMap(subClassOfTriplesTrans.map(t => (t.getSubject, 
t.getObject)).collect) + val subPropertyMap = CollectionUtils.toMultiMap(subPropertyOfTriplesTrans.map(t => (t.getSubject, t.getObject)).collect) + val domainMap = domainTriples.map(t => (t.getSubject, t.getObject)).collect.toMap + val rangeMap = rangeTriples.map(t => (t.getSubject, t.getObject)).collect.toMap // TODO broadcast schema in with Flink // // distribute the schema data structures by means of shared variables @@ -74,39 +75,39 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule // rdfp12c: (?C rdfs:subClassOf ?D ), (?D rdfs:subClassOf ?C ) -> (?C owl:equivalentClass ?D) val equivClassTriplesInf = equivClassTriples.union( subClassOfTriplesTrans - .filter(t => subClassOfMap.getOrElse(t.o, Set.empty).contains(t.s)) - .map(t => RDFTriple(t.s, OWL2.equivalentClass.getURI, t.o)) + .filter(t => subClassOfMap.getOrElse(t.getObject, Set.empty).contains(t.getSubject)) + .map(t => Triple.create(t.getSubject, OWL2.equivalentClass.asNode(), t.getObject)) ) // rdfp13c: (?C rdfs:subPropertyOf ?D ), (?D rdfs:subPropertyOf ?C ) -> (?C owl:equivalentProperty ?D) val equivPropTriplesInf = equivPropertyTriples.union( subPropertyOfTriplesTrans - .filter(t => subPropertyMap.getOrElse(t.o, Set.empty).contains(t.s)) - .map(t => RDFTriple(t.s, OWL2.equivalentProperty.getURI, t.o)) + .filter(t => subPropertyMap.getOrElse(t.getObject, Set.empty).contains(t.getSubject)) + .map(t => Triple.create(t.getSubject, OWL2.equivalentProperty.asNode(), t.getObject)) ) // we also extract properties with certain OWL characteristic and share them val transitiveProperties = - extractTriples(triplesRDD, None, None, Some(OWL2.TransitiveProperty.getURI)) - .map(triple => triple.s) + extractTriples(triplesRDD, None, None, Some(OWL2.TransitiveProperty.asNode())) + .map(triple => triple.getSubject) .collect() val functionalProperties = - extractTriples(triplesRDD, None, None, Some(OWL2.FunctionalProperty.getURI)) - .map(triple => triple.s) + extractTriples(triplesRDD, 
None, None, Some(OWL2.FunctionalProperty.asNode())) + .map(triple => triple.getSubject) .collect() val inverseFunctionalProperties = - extractTriples(triplesRDD, None, None, Some(OWL2.InverseFunctionalProperty.getURI)) - .map(triple => triple.s) + extractTriples(triplesRDD, None, None, Some(OWL2.InverseFunctionalProperty.asNode())) + .map(triple => triple.getSubject) .collect() val symmetricProperties = - extractTriples(triplesRDD, None, None, Some(OWL2.SymmetricProperty.getURI)) - .map(triple => triple.s) + extractTriples(triplesRDD, None, None, Some(OWL2.SymmetricProperty.asNode())) + .map(triple => triple.getSubject) .collect() // and inverse property definitions val inverseOfMap = - extractTriples(triplesRDD, None, Some(OWL2.inverseOf.getURI), None) - .map(triple => (triple.s, triple.o)) + extractTriples(triplesRDD, None, Some(OWL2.inverseOf.asNode()), None) + .map(triple => (triple.getSubject, triple.getObject)) .collect() .toMap val inverseOfMapReverted = inverseOfMap.map(_.swap) @@ -114,38 +115,39 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule // and more OWL vocabulary used in property restrictions // owl:someValuesFrom val someValuesFromMap = - extractTriples(triplesRDD, None, Some(OWL2.someValuesFrom.getURI), None) - .map(triple => (triple.s, triple.o)) + extractTriples(triplesRDD, None, Some(OWL2.someValuesFrom.asNode()), None) + .map(triple => (triple.getSubject, triple.getObject)) .collect() .toMap val someValuesFromMapReversed = someValuesFromMap.map(_.swap) // owl:allValuesFrom val allValuesFromMap = - extractTriples(triplesRDD, None, Some(OWL2.allValuesFrom.getURI), None) - .map(triple => (triple.s, triple.o)) + extractTriples(triplesRDD, None, Some(OWL2.allValuesFrom.asNode()), None) + .map(triple => (triple.getSubject, triple.getObject)) .collect() .toMap val allValuesFromMapReversed = allValuesFromMap.map(_.swap) // owl:hasValue val hasValueMap = - extractTriples(triplesRDD, None, Some(OWL2.hasValue.getURI), 
None) - .map(triple => (triple.s, triple.o)) + extractTriples(triplesRDD, None, Some(OWL2.hasValue.asNode()), None) + .map(triple => (triple.getSubject, triple.getObject)) .collect() .toMap val hasValueMapReversed = hasValueMap.groupBy(_._2).mapValues(_.keys).map(identity) // owl:onProperty val onPropertyMap = - extractTriples(triplesRDD, None, Some(OWL2.onProperty.getURI), None) - .map(triple => (triple.s, triple.o)) + extractTriples(triplesRDD, None, Some(OWL2.onProperty.asNode()), None) + .map(triple => (triple.getSubject, triple.getObject)) .collect() .toMap val onPropertyMapReversed = onPropertyMap.groupBy(_._2).mapValues(_.keys).map(identity) // owl:sameAs is computed separately, thus, we split the data - var triplesFiltered = triplesRDD.filter(triple => triple.p != OWL2.sameAs.getURI && triple.p != RDF.`type`.getURI) - var sameAsTriples = triplesRDD.filter(triple => triple.p == OWL2.sameAs.getURI) - var typeTriples = triplesRDD.filter(triple => triple.p == RDF.`type`.getURI) + var triplesFiltered = triplesRDD.filter(triple => !triple.predicateMatches(OWL2.sameAs.asNode()) + && !triple.predicateMatches(RDF.`type`.asNode())) + var sameAsTriples = triplesRDD.filter(triple => triple.predicateMatches(OWL2.sameAs.asNode())) + var typeTriples = triplesRDD.filter(triple => triple.predicateMatches(RDF.`type`.asNode())) // println("input rdf:type triples:\n" + typeTriples.collect().mkString("\n")) @@ -164,8 +166,8 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule */ val triplesRDFS7 = triplesFiltered - .filter(t => subPropertyMap.contains(t.p)) - .flatMap(t => subPropertyMap(t.p).map(supProp => RDFTriple(t.s, supProp, t.o))) + .filter(t => subPropertyMap.contains(t.getPredicate)) + .flatMap(t => subPropertyMap(t.getPredicate).map(supProp => Triple.create(t.getSubject, supProp, t.getObject))) // add the inferred triples to the existing triples val rdfs7Res = triplesRDFS7.union(triplesFiltered) @@ -178,8 +180,8 @@ class 
ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule */ val triplesRDFS2 = rdfs7Res - .filter(t => domainMap.contains(t.p)) - .map(t => RDFTriple(t.s, RDF.`type`.getURI, domainMap(t.p))) + .filter(t => domainMap.contains(t.getPredicate)) + .map(t => Triple.create(t.getSubject, RDF.`type`.asNode(), domainMap(t.getPredicate))) /* rdfs3 aaa rdfs:range xxx . @@ -187,8 +189,8 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule */ val triplesRDFS3 = rdfs7Res - .filter(t => rangeMap.contains(t.p)) - .map(t => RDFTriple(t.o, RDF.`type`.getURI, rangeMap(t.p))) + .filter(t => rangeMap.contains(t.getPredicate)) + .map(t => Triple.create(t.getObject, RDF.`type`.asNode(), rangeMap(t.getPredicate))) // 4. SubClass inheritance according to rdfs9 @@ -202,18 +204,18 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule triplesRDFS2 .union(triplesRDFS3) .union(typeTriples) - .filter(t => subClassOfMap.contains(t.o)) // such that A has a super class B - .flatMap(t => subClassOfMap(t.o).map(supCls => RDFTriple(t.s, RDF.`type`.getURI, supCls))) // create triple (s a B) + .filter(t => subClassOfMap.contains(t.getObject)) // such that A has a super class B + .flatMap(t => subClassOfMap(t.getObject).map(supCls => Triple.create(t.getSubject, RDF.`type`.asNode(), supCls))) // create triple (s a B) // rdfp14b: (?R owl:hasValue ?V),(?R owl:onProperty ?P),(?X rdf:type ?R ) -> (?X ?P ?V ) val rdfp14b = typeTriples .filter(triple => - hasValueMap.contains(triple.o) && - onPropertyMap.contains(triple.o) + hasValueMap.contains(triple.getObject) && + onPropertyMap.contains(triple.getObject) ) .map(triple => - RDFTriple(triple.s, onPropertyMap(triple.o), hasValueMap(triple.o)) + Triple.create(triple.getSubject, onPropertyMap(triple.getObject), hasValueMap(triple.getObject)) ) // rdfp14a: (?R owl:hasValue ?V), (?R owl:onProperty ?P), (?U ?P ?V) -> (?U rdf:type ?R) @@ -221,12 +223,12 @@ class 
ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule val rdfp14a = rdfs7Res .filter(triple => { var valueRestrictionExists = false - if (onPropertyMapReversed.contains(triple.p)) { + if (onPropertyMapReversed.contains(triple.getPredicate)) { // there is any restriction R for property P - onPropertyMapReversed(triple.p).foreach { restriction => + onPropertyMapReversed(triple.getPredicate).foreach { restriction => if (hasValueMap.contains(restriction) && // R a hasValue restriction - hasValueMap(restriction) == triple.o) { + hasValueMap(restriction) == triple.getObject) { // with value V valueRestrictionExists = true } @@ -236,74 +238,74 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule }) .map(triple => { - val s = triple.s - val p = RDF.`type`.getURI - var o = "" - onPropertyMapReversed(triple.p).foreach { restriction => // get the restriction R + val s = triple.getSubject + val p = RDF.`type`.asNode() + var o: Node = null + onPropertyMapReversed(triple.getPredicate).foreach { restriction => // get the restriction R if (hasValueMap.contains(restriction) && // R a hasValue restriction - hasValueMap(restriction) == triple.o) { // with value V + hasValueMap(restriction) == triple.getObject) { // with value V o = restriction } } - RDFTriple(s, p, o) + Triple.create(s, p, o) } ) println(rdfp14a.collect().mkString("\n")) // rdfp8a: (?P owl:inverseOf ?Q), (?X ?P ?Y) -> (?Y ?Q ?X) val rdfp8a = triplesFiltered - .filter(triple => inverseOfMap.contains(triple.p)) - .map(triple => RDFTriple(triple.o, inverseOfMap(triple.p), triple.s)) + .filter(triple => inverseOfMap.contains(triple.getPredicate)) + .map(triple => Triple.create(triple.getObject, inverseOfMap(triple.getPredicate), triple.getSubject)) // rdfp8b: (?P owl:inverseOf ?Q), (?X ?Q ?Y) -> (?Y ?P ?X) val rdfp8b = triplesFiltered - .filter(triple => inverseOfMapReverted.contains(triple.p)) - .map(triple => RDFTriple(triple.o, inverseOfMapReverted(triple.p), 
triple.s)) + .filter(triple => inverseOfMapReverted.contains(triple.getPredicate)) + .map(triple => Triple.create(triple.getObject, inverseOfMapReverted(triple.getPredicate), triple.getSubject)) // rdfp3: (?P rdf:type owl:SymmetricProperty), (?X ?P ?Y) -> (?Y ?P ?X) val rdfp3 = triplesFiltered - .filter(triple => symmetricProperties.contains(triple.p)) - .map(triple => RDFTriple(triple.o, triple.p, triple.s)) + .filter(triple => symmetricProperties.contains(triple.getPredicate)) + .map(triple => Triple.create(triple.getObject, triple.getPredicate, triple.getSubject)) // rdfp15: (?R owl:someValuesFrom ?D), (?R owl:onProperty ?P), (?X ?P ?A), (?A rdf:type ?D ) -> (?X rdf:type ?R ) val rdfp15_1 = triplesFiltered - .filter(triple => onPropertyMapReversed.contains(triple.p)) // && someValuesFromMapBC.value.contains(onPropertyMapReversedBC.value(triple.predicate))) + .filter(triple => onPropertyMapReversed.contains(triple.getPredicate)) // && someValuesFromMapBC.value.contains(onPropertyMapReversedBC.value(triple.predicate))) .flatMap(triple => { - val restrictions = onPropertyMapReversed(triple.p) - restrictions.map(_r => (_r -> triple.o, triple.s)) // -> ((?R, ?A), ?X) + val restrictions = onPropertyMapReversed(triple.getPredicate) + restrictions.map(_r => (_r -> triple.getObject, triple.getSubject)) // -> ((?R, ?A), ?X) }) // .flatMap(identity) val rdfp15_2 = typeTriples - .filter(triple => someValuesFromMapReversed.contains(triple.o)) - .map(triple => ((someValuesFromMapReversed(triple.o), triple.s), "s")) // -> ((?R, ?A), NIL) + .filter(triple => someValuesFromMapReversed.contains(triple.getObject)) + .map(triple => ((someValuesFromMapReversed(triple.getObject), triple.getSubject), "s")) // -> ((?R, ?A), NIL) val rdfp15 = rdfp15_1 .join(rdfp15_2).where(0).equalTo(0)({// ((?R, ?A), ?X) x ((?R, ?A), NIL) (l, r) => (l._2, r._1._1) }) - .map(e => RDFTriple(e._1, RDF.`type`.getURI, e._2)) // -> (?X rdf:type ?R ) + .map(e => Triple.create(e._1, RDF.`type`.asNode(), e._2)) 
// -> (?X rdf:type ?R ) // println(rdfp15.collect().mkString("\n")) // rdfp16: (?R owl:allValuesFrom ?D), (?R owl:onProperty ?P), (?X ?P ?Y), (?X rdf:type ?R ) -> (?Y rdf:type ?D ) val rdfp16_1 = triplesFiltered // (?X ?P ?Y) - .filter(triple => onPropertyMapReversed.contains(triple.p) && - allValuesFromMap.keySet.intersect(onPropertyMapReversed(triple.p).toSet).nonEmpty) // (?R owl:allValuesFrom ?D), (?R owl:onProperty ?P) + .filter(triple => onPropertyMapReversed.contains(triple.getPredicate) && + allValuesFromMap.keySet.intersect(onPropertyMapReversed(triple.getPredicate).toSet).nonEmpty) // (?R owl:allValuesFrom ?D), (?R owl:onProperty ?P) .flatMap(triple => { - val restrictions = onPropertyMapReversed(triple.p) - restrictions.map(_r => (triple.s -> _r, triple.o)) // -> ((?X, ?R), ?Y) + val restrictions = onPropertyMapReversed(triple.getPredicate) + restrictions.map(_r => (triple.getSubject -> _r, triple.getObject)) // -> ((?X, ?R), ?Y) }) // .flatMap(identity) // println("rdfp16_1:\n" + rdfp16_1.collect().mkString("\n")) val rdfp16_2 = typeTriples // (?X rdf:type ?R ) - .filter(triple => allValuesFromMap.contains(triple.o) && onPropertyMap.contains(triple.o)) // (?R owl:allValuesFrom ?D), (?R owl:onProperty ?P) - .map(triple => ((triple.s, triple.o), allValuesFromMap(triple.o))) // -> ((?X, ?R), ?D) + .filter(triple => allValuesFromMap.contains(triple.getObject) && onPropertyMap.contains(triple.getObject)) // (?R owl:allValuesFrom ?D), (?R owl:onProperty ?P) + .map(triple => ((triple.getSubject, triple.getObject), allValuesFromMap(triple.getObject))) // -> ((?X, ?R), ?D) // println("rdfp16_2:\n" + rdfp16_2.collect().mkString("\n")) @@ -311,7 +313,7 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule .join(rdfp16_2).where(0).equalTo(0) ({// ((?X, ?R), ?Y) x ((?X, ?R), ?D) (l, r) => (l._2, r._2) // -> (Y, D) }) - .map(e => RDFTriple(e._1, RDF.`type`.getURI, e._2)) // -> (?Y rdf:type ?D ) + .map(e => Triple.create(e._1, 
RDF.`type`.asNode(), e._2)) // -> (?Y rdf:type ?D ) // println(rdfp15.collect().mkString("\n")) // deduplicate @@ -326,7 +328,7 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule triplesFiltered = triplesFiltered.union(triplesNew) // rdfp4: (?P rdf:type owl:TransitiveProperty), (?X ?P ?Y), (?Y ?P ?Z) -> (?X ?P ?Z) - val rdfp4 = computeTransitiveClosure(triplesFiltered.filter(triple => transitiveProperties.contains(triple.p))) + val rdfp4 = computeTransitiveClosure(triplesFiltered.filter(triple => transitiveProperties.contains(triple.getPredicate))) // add triples triplesFiltered = triplesFiltered.union(rdfp4) @@ -352,8 +354,8 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule // rdfp1: (?P rdf:type owl:FunctionalProperty), (?A ?P ?B), notLiteral(?B), (?A ?P ?C), notLiteral(?C), notEqual(?B ?C) -> (?B owl:sameAs ?C) val rdfp1_1 = triplesFiltered - .filter(triple => functionalProperties.contains(triple.p)) - .map(triple => (triple.s, triple.p) -> triple.o) // -> ((?A, ?P), ?B) + .filter(triple => functionalProperties.contains(triple.getPredicate)) + .map(triple => (triple.getSubject, triple.getPredicate) -> triple.getObject) // -> ((?A, ?P), ?B) // println(rdfp1_1.collect().mkString("\n")) // println("Joined:" + rdfp1_1.join(rdfp1_1).collect().mkString("\n")) // apply self join @@ -362,18 +364,18 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule (l, r) => (l._2, r._2) // -> (?B, ?C) }) .filter(e => e._1 != e._2) // notEqual(?B ?C) - .map(e => RDFTriple(e._1, OWL2.sameAs.getURI, e._2)) // -> (?B owl:sameAs ?C) + .map(e => Triple.create(e._1, OWL2.sameAs.asNode(), e._2)) // -> (?B owl:sameAs ?C) // rdfp2: (?P rdf:type owl:InverseFunctionalProperty), (?A ?P ?B), (?C ?P ?B), notEqual(?A ?C) -> (?A owl:sameAs ?C) val rdfp2_1 = triplesFiltered - .filter(triple => inverseFunctionalProperties.contains(triple.p)) - .map(triple => (triple.o, triple.p) -> triple.s) // -> ((?B, 
?P), ?A) + .filter(triple => inverseFunctionalProperties.contains(triple.getPredicate)) + .map(triple => (triple.getObject, triple.getPredicate) -> triple.getSubject) // -> ((?B, ?P), ?A) val rdfp2 = rdfp2_1 .join(rdfp2_1).where(0).equalTo(0) ({// ((?B, ?P), ?A) x ((?B, ?P), ?C) (l, r) => (l._2, r._2) // -> (?A, ?C) }) .filter(e => e._1 != e._2) // notEqual(?A ?C) - .map(e => RDFTriple(e._1, OWL2.sameAs.getURI, e._2)) // -> (?A owl:sameAs ?C) + .map(e => Triple.create(e._1, OWL2.sameAs.asNode(), e._2)) // -> (?A owl:sameAs ?C) triplesFiltered = triplesFiltered.union(rdfp1).union(rdfp2) @@ -400,7 +402,7 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule RDFGraph(inferredTriples) } - def deduplicate(triples: DataSet[RDFTriple]): DataSet[RDFTriple] = { + def deduplicate(triples: DataSet[Triple]): DataSet[Triple] = { triples.distinct() } @@ -421,7 +423,7 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule // // val rdfp15 = rdfp15_1 // .join(rdfp15_2) -// .map(e => RDFTriple(e._2._1, RDF.`type`.getURI, e._1._1)) // -> (?X rdf:type ?R ) +// .map(e => Triple.create(e._2._1, RDF.`type`.asNode(), e._1._1)) // -> (?X rdf:type ?R ) // // rdfp15 // } diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasonerRDFS.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasonerRDFS.scala index 88d2c95..a1538b7 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasonerRDFS.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasonerRDFS.scala @@ -4,13 +4,13 @@ import scala.collection.JavaConverters._ import scala.collection.mutable import org.apache.flink.api.common.functions.{RichFilterFunction, RichFlatMapFunction} -import org.apache.flink.api.scala.{ExecutionEnvironment, _} +import 
org.apache.flink.api.scala.{DataSet, ExecutionEnvironment, _} import org.apache.flink.configuration.Configuration import org.apache.flink.util.Collector +import org.apache.jena.graph.Triple import org.apache.jena.vocabulary.{RDF, RDFS} import org.slf4j.LoggerFactory -import net.sansa_stack.inference.data.RDFTriple import net.sansa_stack.inference.flink.data.RDFGraph import net.sansa_stack.inference.flink.extraction.RDFSSchemaExtractor import net.sansa_stack.inference.flink.utils.DataSetUtils.DataSetOps @@ -54,7 +54,7 @@ class ForwardRuleReasonerRDFS(env: ExecutionEnvironment) extends ForwardRuleReas * yyy rdfs:subClassOf zzz . xxx rdfs:subClassOf zzz . */ val subClassOfTriples = - extractTriples(schemaTriples, RDFS.subClassOf.getURI) + extractTriples(schemaTriples, RDFS.subClassOf.asNode()) .name("rdfs:subClassOf") // extract rdfs:subClassOf triples val subClassOfTriplesTrans = computeTransitiveClosureOptSemiNaive(subClassOfTriples).name("rdfs11") @@ -64,13 +64,13 @@ class ForwardRuleReasonerRDFS(env: ExecutionEnvironment) extends ForwardRuleReas yyy rdfs:subPropertyOf zzz . => xxx rdfs:subPropertyOf zzz . 
*/ val subPropertyOfTriples = - extractTriples(schemaTriples, RDFS.subPropertyOf.getURI) + extractTriples(schemaTriples, RDFS.subPropertyOf.asNode()) .name("rdfs:subPropertyOf") // extract rdfs:subPropertyOf triples val subPropertyOfTriplesTrans = computeTransitiveClosureOptSemiNaive(subPropertyOfTriples).name("rdfs5") // split by rdf:type - val split = triplesDS.partitionBy(t => t.p == RDF.`type`.getURI) + val split = triplesDS.partitionBy(t => t.predicateMatches(RDF.`type`.asNode())) var typeTriples = split._1 var otherTriples = split._2 @@ -82,39 +82,39 @@ class ForwardRuleReasonerRDFS(env: ExecutionEnvironment) extends ForwardRuleReas */ val triplesRDFS7 = if (useSchemaBroadCasting) { otherTriples - .filter(new RichFilterFunction[RDFTriple]() { + .filter(new RichFilterFunction[Triple]() { - var broadcastSet: Traversable[RDFTriple] = _ + var broadcastSet: Traversable[Triple] = _ override def open(config: Configuration): Unit = { - broadcastSet = getRuntimeContext().getBroadcastVariable[RDFTriple]("subPropertyTriples").asScala + broadcastSet = getRuntimeContext.getBroadcastVariable[Triple]("subPropertyTriples").asScala } - override def filter(t: RDFTriple): Boolean = broadcastSet.exists(_.s == t.p) + override def filter(t: Triple): Boolean = broadcastSet.exists(_.subjectMatches(t.getPredicate)) }) .withBroadcastSet(subPropertyOfTriplesTrans, "subPropertyTriples") // .flatMap(new SubClassOfFlatMapFunction("subClasses")).withBroadcastSet(subClassOfTriplesTrans, "subClasses") // create triple (s a B) - .flatMap(new RichFlatMapFunction[RDFTriple, RDFTriple]() { - var broadcastSet: Traversable[RDFTriple] = _ + .flatMap(new RichFlatMapFunction[Triple, Triple]() { + var broadcastSet: Traversable[Triple] = _ override def open(config: Configuration): Unit = { - broadcastSet = getRuntimeContext().getBroadcastVariable[RDFTriple]("subPropertyTriples").asScala + broadcastSet = getRuntimeContext.getBroadcastVariable[Triple]("subPropertyTriples").asScala } - override def 
flatMap(in: RDFTriple, collector: Collector[RDFTriple]): Unit = { + override def flatMap(in: Triple, collector: Collector[Triple]): Unit = { broadcastSet - .filter(_.s == in.p) - .foreach(t => collector.collect(RDFTriple(in.s, t.o, in.o))) + .filter(_.subjectMatches(in.getPredicate)) + .foreach(t => collector.collect(Triple.create(in.getSubject, t.getObject, in.getObject))) } }) .withBroadcastSet(subPropertyOfTriplesTrans, "subPropertyTriples") } else { val subPropertyMap = - CollectionUtils.toMultiMap(subPropertyOfTriplesTrans.map(t => (t.s, t.o)).collect) + CollectionUtils.toMultiMap(subPropertyOfTriplesTrans.map(t => (t.getSubject, t.getObject)).collect) otherTriples // all triples (s p1 o) - .filter(t => subPropertyMap.contains(t.p)) // such that p1 has a super property p2 - .flatMap(t => subPropertyMap(t.p).map(supProp => RDFTriple(t.s, supProp, t.o))) // create triple (s p2 o) + .filter(t => subPropertyMap.contains(t.getPredicate)) // such that p1 has a super property p2 + .flatMap(t => subPropertyMap(t.getPredicate).map(supProp => Triple.create(t.getSubject, supProp, t.getObject))) // create triple (s p2 o) }.name("rdfs7") @@ -128,42 +128,42 @@ class ForwardRuleReasonerRDFS(env: ExecutionEnvironment) extends ForwardRuleReas yyy aaa zzz . => yyy rdf:type xxx . 
*/ val domainTriples = - extractTriples(schemaTriples, RDFS.domain.getURI).name("rdfs:domain") + extractTriples(schemaTriples, RDFS.domain.asNode()).name("rdfs:domain") val triplesRDFS2 = if (useSchemaBroadCasting) { otherTriples - .filter(new RichFilterFunction[RDFTriple]() { + .filter(new RichFilterFunction[Triple]() { - var broadcastSet: Traversable[RDFTriple] = _ + var broadcastSet: Traversable[Triple] = _ override def open(config: Configuration): Unit = { - broadcastSet = getRuntimeContext().getBroadcastVariable[RDFTriple]("domainTriples").asScala + broadcastSet = getRuntimeContext.getBroadcastVariable[Triple]("domainTriples").asScala } - override def filter(t: RDFTriple): Boolean = - broadcastSet.exists(_.s == t.p) + override def filter(t: Triple): Boolean = + broadcastSet.exists(_.subjectMatches(t.getPredicate)) }) .withBroadcastSet(domainTriples, "domainTriples") - .flatMap(new RichFlatMapFunction[RDFTriple, RDFTriple]() { - var broadcastSet: Traversable[RDFTriple] = _ + .flatMap(new RichFlatMapFunction[Triple, Triple]() { + var broadcastSet: Traversable[Triple] = _ override def open(config: Configuration): Unit = { - broadcastSet = getRuntimeContext().getBroadcastVariable[RDFTriple]("domainTriples").asScala + broadcastSet = getRuntimeContext.getBroadcastVariable[Triple]("domainTriples").asScala } - override def flatMap(in: RDFTriple, collector: Collector[RDFTriple]): Unit = { + override def flatMap(in: Triple, collector: Collector[Triple]): Unit = { broadcastSet - .filter(_.s == in.p) - .foreach(t => collector.collect(RDFTriple(in.s, RDF.`type`.getURI, t.o))) + .filter(_.subjectMatches(in.getPredicate)) + .foreach(t => collector.collect(Triple.create(in.getSubject, RDF.`type`.asNode(), t.getObject))) } }) .withBroadcastSet(domainTriples, "domainTriples") } else { - val domainMap = domainTriples.map(t => (t.s, t.o)).collect.toMap + val domainMap = domainTriples.map(t => (t.getSubject, t.getObject)).collect.toMap otherTriples - .filter(t => 
domainMap.contains(t.p)) - .map(t => RDFTriple(t.s, RDF.`type`.getURI, domainMap(t.p))) + .filter(t => domainMap.contains(t.getPredicate)) + .map(t => Triple.create(t.getSubject, RDF.`type`.asNode(), domainMap(t.getPredicate))) }.name("rdfs2") @@ -172,42 +172,42 @@ class ForwardRuleReasonerRDFS(env: ExecutionEnvironment) extends ForwardRuleReas yyy aaa zzz . => zzz rdf:type xxx . */ val rangeTriples = - extractTriples(schemaTriples, RDFS.range.getURI).name("rdfs:range") + extractTriples(schemaTriples, RDFS.range.asNode()).name("rdfs:range") val triplesRDFS3 = if (useSchemaBroadCasting) { otherTriples - .filter(new RichFilterFunction[RDFTriple]() { + .filter(new RichFilterFunction[Triple]() { - var broadcastSet: Traversable[RDFTriple] = _ + var broadcastSet: Traversable[Triple] = _ override def open(config: Configuration): Unit = { - broadcastSet = getRuntimeContext().getBroadcastVariable[RDFTriple]("rangeTriples").asScala + broadcastSet = getRuntimeContext.getBroadcastVariable[Triple]("rangeTriples").asScala } - override def filter(t: RDFTriple): Boolean = - broadcastSet.exists(_.s == t.p) + override def filter(t: Triple): Boolean = + broadcastSet.exists(_.subjectMatches(t.getPredicate)) }) .withBroadcastSet(rangeTriples, "rangeTriples") - .flatMap(new RichFlatMapFunction[RDFTriple, RDFTriple]() { - var broadcastSet: Traversable[RDFTriple] = _ + .flatMap(new RichFlatMapFunction[Triple, Triple]() { + var broadcastSet: Traversable[Triple] = _ override def open(config: Configuration): Unit = { - broadcastSet = getRuntimeContext().getBroadcastVariable[RDFTriple]("rangeTriples").asScala + broadcastSet = getRuntimeContext.getBroadcastVariable[Triple]("rangeTriples").asScala } - override def flatMap(in: RDFTriple, collector: Collector[RDFTriple]): Unit = { + override def flatMap(in: Triple, collector: Collector[Triple]): Unit = { broadcastSet - .filter(_.s == in.p) - .foreach(t => collector.collect(RDFTriple(in.o, RDF.`type`.getURI, t.o))) + 
.filter(_.subjectMatches(in.getPredicate)) + .foreach(t => collector.collect(Triple.create(in.getObject, RDF.`type`.asNode(), t.getObject))) } }) .withBroadcastSet(rangeTriples, "rangeTriples") } else { - val rangeMap = rangeTriples.map(t => (t.s, t.o)).collect().toMap + val rangeMap = rangeTriples.map(t => (t.getSubject, t.getObject)).collect().toMap otherTriples - .filter(t => rangeMap.contains(t.p)) - .map(t => RDFTriple(t.o, RDF.`type`.getURI, rangeMap(t.p))) + .filter(t => rangeMap.contains(t.getPredicate)) + .map(t => Triple.create(t.getObject, RDF.`type`.asNode(), rangeMap(t.getPredicate))) }.name("rdfs3") @@ -226,42 +226,42 @@ class ForwardRuleReasonerRDFS(env: ExecutionEnvironment) extends ForwardRuleReas val triplesRDFS9 = if (useSchemaBroadCasting) { typeTriples // all rdf:type triples (s a A) // .filter(new SubClassOfFilterFunction("subClasses")).withBroadcastSet(subClassOfTriplesTrans, "subClasses") // such that A has a super class B - .filter(new RichFilterFunction[RDFTriple]() { + .filter(new RichFilterFunction[Triple]() { - var broadcastSet: Traversable[RDFTriple] = _ + var broadcastSet: Traversable[Triple] = _ override def open(config: Configuration): Unit = { - broadcastSet = getRuntimeContext().getBroadcastVariable[RDFTriple]("subClassTriples").asScala + broadcastSet = getRuntimeContext.getBroadcastVariable[Triple]("subClassTriples").asScala } - override def filter(t: RDFTriple): Boolean = - broadcastSet.exists(_.s == t.o) + override def filter(t: Triple): Boolean = + broadcastSet.exists(_.subjectMatches(t.getObject)) }) .withBroadcastSet(subClassOfTriplesTrans, "subClassTriples") // .flatMap(new SubClassOfFlatMapFunction("subClasses")).withBroadcastSet(subClassOfTriplesTrans, "subClasses") // create triple (s a B) - .flatMap(new RichFlatMapFunction[RDFTriple, RDFTriple]() { - var broadcastSet: Traversable[RDFTriple] = _ + .flatMap(new RichFlatMapFunction[Triple, Triple]() { + var broadcastSet: Traversable[Triple] = _ override def open(config: 
Configuration): Unit = { - broadcastSet = getRuntimeContext().getBroadcastVariable[RDFTriple]("subClassTriples").asScala + broadcastSet = getRuntimeContext.getBroadcastVariable[Triple]("subClassTriples").asScala } - override def flatMap(in: RDFTriple, collector: Collector[RDFTriple]): Unit = { + override def flatMap(in: Triple, collector: Collector[Triple]): Unit = { broadcastSet - .filter(t => in.o == t.s) - .foreach(t => collector.collect(RDFTriple(in.s, in.p, t.o))) + .filter(t => in.objectMatches(t.getSubject)) + .foreach(t => collector.collect(Triple.create(in.getSubject, in.getPredicate, t.getObject))) } }) .withBroadcastSet(subClassOfTriplesTrans, "subClassTriples") } else { - val subClassOfMap = CollectionUtils.toMultiMap(subClassOfTriplesTrans.map(t => (t.s, t.o)).collect) + val subClassOfMap = CollectionUtils.toMultiMap(subClassOfTriplesTrans.map(t => (t.getSubject, t.getObject)).collect) typeTriples // all rdf:type triples (s a A) - .filter(t => subClassOfMap.contains(t.o)) // such that A has a super class B + .filter(t => subClassOfMap.contains(t.getObject)) // such that A has a super class B .flatMap( t => - subClassOfMap(t.o) - .map(supCls => RDFTriple(t.s, RDF.`type`.getURI, supCls)) + subClassOfMap(t.getObject) + .map(supCls => Triple.create(t.getSubject, RDF.`type`.asNode(), supCls)) ) // create triple (s a B) }.name("rdfs9") @@ -284,39 +284,39 @@ class ForwardRuleReasonerRDFS(env: ExecutionEnvironment) extends ForwardRuleReas .flatMap( t => Set( - RDFTriple(t.s, RDF.`type`.getURI, RDFS.Resource.getURI), - RDFTriple(t.o, RDF.`type`.getURI, RDFS.Resource.getURI) - // RDFTriple(t.predicate, RDF.`type`.getURI, RDF.Property.getURI) + Triple.create(t.getSubject, RDF.`type`.asNode(), RDFS.Resource.asNode()), + Triple.create(t.getObject, RDF.`type`.asNode(), RDFS.Resource.asNode()) + // Triple(t.predicate, RDF.`type`.getURI, RDF.Property.getURI) ) ) .name("rdfs4") // rdfs12: (?x rdf:type rdfs:ContainerMembershipProperty) -> (?x rdfs:subPropertyOf 
rdfs:member) val rdfs12 = typeTriples - .filter(t => t.o == RDFS.ContainerMembershipProperty.getURI) - .map(t => RDFTriple(t.s, RDF.`type`.getURI, RDFS.member.getURI)) + .filter(t => t.objectMatches(RDFS.ContainerMembershipProperty.asNode())) + .map(t => Triple.create(t.getSubject, RDF.`type`.asNode(), RDFS.member.asNode())) .name("rdfs12") // rdfs6: (p rdf:type rdf:Property) => (p rdfs:subPropertyOf p) val rdfs6 = typeTriples - .filter(t => t.o == RDF.Property.getURI) - .map(t => RDFTriple(t.s, RDFS.subPropertyOf.getURI, t.s)) + .filter(t => t.objectMatches(RDF.Property.asNode())) + .map(t => Triple.create(t.getSubject, RDFS.subPropertyOf.asNode(), t.getSubject)) .name("rdfs6") // rdfs8: (s rdf:type rdfs:Class ) => (s rdfs:subClassOf rdfs:Resource) // rdfs10: (s rdf:type rdfs:Class) => (s rdfs:subClassOf s) val rdfs8_10 = typeTriples - .filter(t => t.o == RDFS.Class.getURI) + .filter(t => t.objectMatches(RDFS.Class.asNode())) .flatMap( t => Set( - RDFTriple(t.s, RDFS.subClassOf.getURI, RDFS.Resource.getURI), - RDFTriple(t.s, RDFS.subClassOf.getURI, t.s) + Triple.create(t.getSubject, RDFS.subClassOf.asNode(), RDFS.Resource.asNode()), + Triple.create(t.getSubject, RDFS.subClassOf.asNode(), t.getSubject) ) ) .name("rdfs8/rdfs10") - val additionalTripleRDDs = mutable.Seq(rdfs4, rdfs6, rdfs8_10) + val additionalTripleRDDs = mutable.Seq(rdfs4, rdfs6, rdfs8_10, rdfs12) allTriples = env.union(Seq(allTriples) ++ additionalTripleRDDs).distinct() } @@ -332,43 +332,43 @@ class ForwardRuleReasonerRDFS(env: ExecutionEnvironment) extends ForwardRuleReas RDFGraph(allTriples) } - object SchemaTriplesFilter extends ((RDFTriple) => Boolean) with Serializable { + object SchemaTriplesFilter extends (Triple => Boolean) with Serializable { - val schemaPredicates = - Set(RDFS.subClassOf.getURI, RDFS.subPropertyOf.getURI, RDFS.domain.getURI, RDFS.range.getURI) + private val schemaPredicates = + Set(RDFS.subClassOf, RDFS.subPropertyOf, RDFS.domain, RDFS.range).map(_.asNode()) - override 
def apply(t: RDFTriple): Boolean = schemaPredicates.contains(t.p) + override def apply(t: Triple): Boolean = schemaPredicates.contains(t.getPredicate) } - private def extractSchemaTriples(triples: DataSet[RDFTriple]): DataSet[RDFTriple] = { + private def extractSchemaTriples(triples: DataSet[Triple]): DataSet[Triple] = { triples.filter(SchemaTriplesFilter).name("schemaTriples") } - class SubClassOfFilterFunction(predicate: String) extends RichFilterFunction[RDFTriple]() { + class SubClassOfFilterFunction(predicate: String) extends RichFilterFunction[Triple]() { - var broadcastSet: Traversable[RDFTriple] = _ + var broadcastSet: Traversable[Triple] = _ override def open(config: Configuration): Unit = { // Access the broadcasted DataSet as a Collection - broadcastSet = getRuntimeContext().getBroadcastVariable[RDFTriple](predicate).asScala + broadcastSet = getRuntimeContext.getBroadcastVariable[Triple](predicate).asScala } - override def filter(t: RDFTriple): Boolean = - broadcastSet.exists(_.s == t.o) + override def filter(t: Triple): Boolean = + broadcastSet.exists(_.subjectMatches(t.getObject)) } - class SubClassOfFlatMapFunction(predicate: String) extends RichFlatMapFunction[RDFTriple, RDFTriple]() { - var broadcastSet: Traversable[RDFTriple] = _ + class SubClassOfFlatMapFunction(predicate: String) extends RichFlatMapFunction[Triple, Triple]() { + var broadcastSet: Traversable[Triple] = _ override def open(config: Configuration): Unit = { // Access the broadcasted DataSet as a Collection - broadcastSet = getRuntimeContext().getBroadcastVariable[RDFTriple](predicate).asScala + broadcastSet = getRuntimeContext.getBroadcastVariable[Triple](predicate).asScala } - override def flatMap(in: RDFTriple, collector: Collector[RDFTriple]): Unit = { + override def flatMap(in: Triple, collector: Collector[Triple]): Unit = { broadcastSet - .filter(t => in.o == t.s) - .foreach(t => collector.collect(RDFTriple(in.s, in.p, t.o))) + .filter(t => in.objectMatches(t.getSubject)) + 
.foreach(t => collector.collect(Triple.create(in.getSubject, in.getPredicate, t.getObject))) } } diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/TransitiveReasoner.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/TransitiveReasoner.scala index c2f2765..8fda1fd 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/TransitiveReasoner.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/TransitiveReasoner.scala @@ -5,8 +5,8 @@ import scala.reflect.ClassTag import org.apache.flink.api.common.typeinfo.TypeInformation import org.apache.flink.api.scala.{DataSet, _} import org.apache.flink.util.Collector +import org.apache.jena.graph.{Node, Triple} -import net.sansa_stack.inference.data.RDFTriple import net.sansa_stack.inference.utils.Profiler /** @@ -29,7 +29,7 @@ trait TransitiveReasoner extends Profiler{ * @param triples the set of triples * @return a set containing the transitive closure of the triples */ - def computeTransitiveClosure(triples: Set[RDFTriple]): Set[RDFTriple] = { + def computeTransitiveClosure(triples: Set[Triple]): Set[Triple] = { val tc = addTransitive(triples) // recursive call if set changed, otherwise stop and return if (tc.size == triples.size) triples else computeTransitiveClosure(tc) @@ -39,8 +39,9 @@ trait TransitiveReasoner extends Profiler{ // s ++ (for ((s1, p1, o1) <- s; (s2, p2, o2) <- s if o1 == s2) yield (s1, p1, o2)) // } - def addTransitive(triples: Set[RDFTriple]): Set[RDFTriple] = { - triples ++ (for (t1 <- triples; t2 <- triples if t1.o == t2.s) yield RDFTriple(t1.s, t1.p, t2.o)) + def addTransitive(triples: Set[Triple]): Set[Triple] = { + triples ++ (for (t1 <- triples; t2 <- triples if t1.objectMatches(t2.getSubject)) + yield Triple.create(t1.getSubject, t1.getPredicate, t2.getObject)) } /** @@ -50,16 +51,16 @@ trait TransitiveReasoner extends 
Profiler{ * @param triples the DataSet of triples * @return a DataSet containing the transitive closure of the triples */ - def computeTransitiveClosure(triples: DataSet[RDFTriple]): DataSet[RDFTriple] = { + def computeTransitiveClosure(triples: DataSet[Triple]): DataSet[Triple] = { if (triples.count() == 0) return triples log.info("computing TC...") profile { // keep the predicate - val predicate = triples.first(1).collect().head.p + val predicate = triples.first(1).collect().head.getPredicate // compute the TC - var subjectObjectPairs = triples.map(t => (t.s, t.o)) + var subjectObjectPairs = triples.map(t => (t.getSubject, t.getObject)) // because join() joins on keys, in addition the pairs are stored in reversed order (o, s) val objectSubjectPairs = subjectObjectPairs.map(t => (t._2, t._1)) @@ -86,7 +87,7 @@ trait TransitiveReasoner extends Profiler{ } while (nextCount != oldCount) log.info("TC has " + nextCount + " triples.") - subjectObjectPairs.map(p => RDFTriple(p._1, predicate, p._2)) + subjectObjectPairs.map(p => Triple.create(p._1, predicate, p._2)) } } @@ -99,20 +100,20 @@ trait TransitiveReasoner extends Profiler{ * @param triples the DataSet of triples * @return a DataSet containing the transitive closure of the triples */ - def computeTransitiveClosureOpt(triples: DataSet[RDFTriple]): DataSet[RDFTriple] = { + def computeTransitiveClosureOpt(triples: DataSet[Triple]): DataSet[Triple] = { if (triples.count() == 0) return triples log.info("computing TC...") profile { // keep the predicate - val predicate = triples.first(1).collect().head.p + val predicate = triples.first(1).collect().head.getPredicate // convert to tuples needed for the JOIN operator - val subjectObjectPairs = triples.map(t => (t.s, t.o)) + val subjectObjectPairs = triples.map(t => (t.getSubject, t.getObject)) // compute the TC val res = subjectObjectPairs.iterateWithTermination(10) { - prevPaths: DataSet[(String, String)] => + prevPaths: DataSet[(Node, Node)] => val nextPaths = 
prevPaths .join(subjectObjectPairs).where(1).equalTo(0) { @@ -125,7 +126,7 @@ trait TransitiveReasoner extends Profiler{ val terminate = prevPaths .coGroup(nextPaths) .where(0).equalTo(0) { - (prev, next, out: Collector[(String, String)]) => { + (prev, next, out: Collector[(Node, Node)]) => { val prevPaths = prev.toSet for (n <- next) if (!prevPaths.contains(n)) out.collect(n) @@ -135,7 +136,7 @@ trait TransitiveReasoner extends Profiler{ } // map back to RDF triples - res.map(p => RDFTriple(p._1, predicate, p._2)) + res.map(p => Triple.create(p._1, predicate, p._2)) } } @@ -184,15 +185,15 @@ trait TransitiveReasoner extends Profiler{ * @param triples the DataSet of triples * @return a DataSet containing the transitive closure of the triples */ - def computeTransitiveClosureOptSemiNaive(triples: DataSet[RDFTriple]): DataSet[RDFTriple] = { + def computeTransitiveClosureOptSemiNaive(triples: DataSet[Triple]): DataSet[Triple] = { log.info("computing TC...") - def iterate(s: DataSet[RDFTriple], ws: DataSet[RDFTriple]): (DataSet[RDFTriple], DataSet[RDFTriple]) = { + def iterate(s: DataSet[Triple], ws: DataSet[Triple]): (DataSet[Triple], DataSet[Triple]) = { val resolvedRedirects = triples.join(ws) - .where { _.s } - .equalTo { _.o } + .where { _.getSubject } + .equalTo { _.getObject } .map { joinResult => joinResult match { case (redirect, link) => - RDFTriple(link.s, redirect.p, redirect.o) + Triple.create(link.getSubject, redirect.getPredicate, redirect.getObject) } }.name("TC-From-Iteration") (resolvedRedirects, resolvedRedirects) From 2df2c0cc210335e1c0e088cab67852fec62a8795 Mon Sep 17 00:00:00 2001 From: Lorenz Buehmann Date: Wed, 27 Jun 2018 12:55:51 +0200 Subject: [PATCH 02/23] Cont. 
Flink RDF --- pom.xml | 5 ++ sansa-inference-flink/pom.xml | 6 ++ .../flink/RDFGraphMaterializer.scala | 7 +-- .../inference/flink/data/RDFGraph.scala | 4 +- .../inference/flink/data/RDFGraphLoader.scala | 61 +++++++++++-------- .../inference/flink/data/RDFGraphWriter.scala | 15 ++--- .../inference/flink/RDFGraphTestCase.scala | 39 ++++++------ .../conformance/OWLHorstConformanceTest.scala | 19 +++--- .../conformance/RDFSConformanceTest.scala | 21 ++++--- .../flink/rules/TransitivityRuleTest.scala | 23 ++++--- 10 files changed, 110 insertions(+), 90 deletions(-) diff --git a/pom.xml b/pom.xml index 98d3242..1624b56 100644 --- a/pom.xml +++ b/pom.xml @@ -111,6 +111,11 @@ sansa-rdf-partition-core ${sansa.rdf.version} + + net.sansa-stack + sansa-rdf-flink_${scala.binary.version} + ${sansa.rdf.version} + diff --git a/sansa-inference-flink/pom.xml b/sansa-inference-flink/pom.xml index 5eb0fee..061007f 100644 --- a/sansa-inference-flink/pom.xml +++ b/sansa-inference-flink/pom.xml @@ -47,6 +47,12 @@ under the License. 
test + + + net.sansa-stack + sansa-rdf-flink_${scala.binary.version} + + org.apache.flink diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/RDFGraphMaterializer.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/RDFGraphMaterializer.scala index c265dfd..43797a1 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/RDFGraphMaterializer.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/RDFGraphMaterializer.scala @@ -74,7 +74,6 @@ object RDFGraphMaterializer { // load triples from disk val graph = RDFGraphLoader.loadFromDisk(input, env) -// println(s"|G| = ${graph.size()}") // create reasoner val reasoner = profile match { @@ -90,7 +89,7 @@ object RDFGraphMaterializer { // compute inferred graph val inferredGraph = reasoner.apply(graph) - println(s"|G_inf| = ${inferredGraph.size()}") + println(s"|G_inf| = ${inferredGraph.size}") // write triples to disk // RDFGraphWriter.writeToDisk(inferredGraph, output, writeToSingleFile, sortedOutput) @@ -119,7 +118,7 @@ object RDFGraphMaterializer { // the CLI parser val parser = new scopt.OptionParser[Config]("RDFGraphMaterializer") { - head("RDFGraphMaterializer", "0.1.0") + head("RDFGraphMaterializer", "0.4.0") // opt[Seq[File]]('i', "input").required().valueName(",,..."). // action((x, c) => c.copy(in = x)). 
@@ -128,7 +127,7 @@ object RDFGraphMaterializer { .required() .valueName("") .action((x, c) => c.copy(in = x)) - .text("path to file or directory that contains the input files (in N-Triple format)") + .text("path to file or directory that contains the input files (in N-Triples format)") opt[URI]('o', "out") .required() diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraph.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraph.scala index b8c388d..2ae0099 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraph.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraph.scala @@ -67,7 +67,5 @@ case class RDFGraph(triples: DataSet[Triple]) { * * @return the number of triples */ - def size(): Long = { - triples.count() - } + lazy val size: Long = triples.count() } diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraphLoader.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraphLoader.scala index e631d4a..3349929 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraphLoader.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraphLoader.scala @@ -1,17 +1,15 @@ package net.sansa_stack.inference.flink.data -import java.io.File import java.net.URI -import org.apache.flink.api.scala.{ExecutionEnvironment, _} - -import net.sansa_stack.inference.data.RDFTriple -import org.apache.flink.configuration.Configuration +import scala.collection.JavaConverters._ import scala.language.implicitConversions -import org.apache.jena.rdf.model.impl.NTripleReader +import org.apache.flink.api.scala.{ExecutionEnvironment, _} +import org.apache.jena.riot.{Lang, RDFDataMgr} + +import net.sansa_stack.rdf.benchmark.io.ReadableByteChannelFromIterator -import 
net.sansa_stack.inference.utils.NTriplesStringToRDFTriple /** * @author Lorenz Buehmann @@ -20,41 +18,50 @@ object RDFGraphLoader { implicit def pathURIsConverter(uris: Seq[URI]): String = uris.map(p => p.toString).mkString(",") - def loadFromFile(path: String, env: ExecutionEnvironment): RDFGraph = { - val triples = env.readTextFile(path) - .map(line => line.replace(">", "").replace("<", "").split("\\s+")) // line to tokens - .map(tokens => RDFTriple(tokens(0), tokens(1), tokens(2))) // tokens to triple - RDFGraph(triples) + def loadFromDisk(path: String, env: ExecutionEnvironment): RDFGraph = { + loadFromDisk(URI.create(path), env) } def loadFromDisk(path: URI, env: ExecutionEnvironment): RDFGraph = { - // create a configuration object - val parameters = new Configuration + loadFromDisk(Seq(path), env) + } + + def loadFromDisk(paths: Seq[URI], env: ExecutionEnvironment): RDFGraph = { +// // create a configuration object +// val parameters = new Configuration +// +// // set the recursive enumeration parameter +// parameters.setBoolean("recursive.file.enumeration", true) +// env.readTextFile(f).withParameters(parameters) - // set the recursive enumeration parameter - parameters.setBoolean("recursive.file.enumeration", true) + val tmp: List[String] = paths.map(path => path.toString).toList + val triples = tmp + .map(f => env.readTextFile(f)) // no support to read from multiple paths at once, thus, map + union here + .reduce(_ union _) // TODO Flink 1.5.0 supports multiple paths via FileInputFormat + .mapPartition(p => { + // convert iterator to input stream + val is = ReadableByteChannelFromIterator.toInputStream(p.asJava) - // pass the configuration to the data source - val triples = env.readTextFile(path.toString).withParameters(parameters) - .map(line => line.replace(">", "").replace("<", "").split("\\s+")) // line to tokens - .map(tokens => RDFTriple(tokens(0), tokens(1), tokens(2))) - .name("triples") // tokens to triple + RDFDataMgr.createIteratorTriples(is, 
Lang.NTRIPLES, null).asScala + }) + .name("triples") RDFGraph(triples) } - def loadFromDisk(paths: Seq[URI], env: ExecutionEnvironment): RDFGraph = { + def main(args: Array[String]): Unit = { + if (args.length == 0) println("Usage: RDFGraphLoader ") - val tmp: List[String] = paths.map(path => path.toString).toList + val path = args(0) - val converter = new NTriplesStringToRDFTriple() + val env = ExecutionEnvironment.getExecutionEnvironment - val triples = tmp - .map(f => env.readTextFile(f).flatMap(line => converter.apply(line))).reduce(_ union _).name("triples") + val ds = RDFGraphLoader.loadFromDisk(path, env).triples - RDFGraph(triples) + println(s"size:${ds.count}") + println("sample data:\n" + ds.first(10).map { _.toString.replaceAll("[\\x00-\\x1f]","???")}.collect().mkString("\n")) } } diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraphWriter.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraphWriter.scala index f421f72..ed085b8 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraphWriter.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraphWriter.scala @@ -1,6 +1,6 @@ package net.sansa_stack.inference.flink.data -import java.io.{ByteArrayInputStream, File} +import java.io.ByteArrayInputStream import java.net.URI import java.nio.charset.StandardCharsets @@ -8,10 +8,11 @@ import org.apache.flink.api.common.operators.Order import org.apache.flink.api.scala._ import org.apache.flink.core.fs.FileSystem import org.apache.jena.rdf.model.{Model, ModelFactory} - -import net.sansa_stack.inference.utils.{RDFTripleOrdering, RDFTripleToNTripleString} +import org.apache.jena.sparql.util.TripleComparator import org.slf4j.LoggerFactory +import net.sansa_stack.inference.utils.{JenaTripleToNTripleString, RDFTripleOrdering} + /** * Writes an RDF graph to disk. 
* @@ -26,10 +27,10 @@ object RDFGraphWriter { logger.info("writing triples to disk...") val startTime = System.currentTimeMillis() - implicit val ordering = RDFTripleOrdering + implicit val ordering = new TripleComparator() graph.triples.map(t => (t, t)).sortPartition(1, Order.DESCENDING).map(_._1) - .map(new RDFTripleToNTripleString()) // to N-TRIPLES string + .map(new JenaTripleToNTripleString()) // to N-Triples string .writeAsText(path, writeMode = FileSystem.WriteMode.OVERWRITE) logger.info("finished writing triples to disk in " + (System.currentTimeMillis()-startTime) + "ms.") @@ -61,14 +62,14 @@ object RDFGraphWriter { } tmp - .map(new RDFTripleToNTripleString()) // to N-TRIPLES string + .map(new JenaTripleToNTripleString()) // to N-TRIPLES string .writeAsText(path.toString, writeMode = FileSystem.WriteMode.OVERWRITE) logger.info("finished writing triples to disk in " + (System.currentTimeMillis()-startTime) + "ms.") } def convertToModel(graph: RDFGraph) : Model = { - val modelString = graph.triples.map(new RDFTripleToNTripleString()) + val modelString = graph.triples.map(new JenaTripleToNTripleString()) .collect().mkString("\n") val model = ModelFactory.createDefaultModel() diff --git a/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/RDFGraphTestCase.scala b/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/RDFGraphTestCase.scala index 04e2f5b..e111e6d 100644 --- a/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/RDFGraphTestCase.scala +++ b/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/RDFGraphTestCase.scala @@ -3,17 +3,20 @@ package net.sansa_stack.inference.flink import java.util import java.util.Comparator +import scala.collection.JavaConverters._ + import com.google.common.collect.ComparisonChain -import net.sansa_stack.inference.flink.data.RDFGraph import org.apache.flink.api.scala.{ExecutionEnvironment, _} import 
org.apache.flink.test.util.MultipleProgramsTestBase.TestExecutionMode import org.apache.flink.test.util.{MultipleProgramsTestBase, TestBaseUtils} +import org.apache.jena.graph.{NodeFactory, Triple} +import org.apache.jena.sparql.util.TripleComparator import org.junit.Test import org.junit.runner.RunWith import org.junit.runners.Parameterized -import net.sansa_stack.inference.data.RDFTriple -import scala.collection.JavaConverters._ +import net.sansa_stack.inference.data.RDFTriple +import net.sansa_stack.inference.flink.data.RDFGraph /** * A test case for the computation of the transitive closure (TC). @@ -26,19 +29,24 @@ class RDFGraphTestCase(mode: TestExecutionMode) extends MultipleProgramsTestBase def testSubtract(): Unit = { val env = ExecutionEnvironment.getExecutionEnvironment + val s1 = NodeFactory.createURI("s1") + val p1 = NodeFactory.createURI("p1") + val o1 = NodeFactory.createURI("o1") + val o2 = NodeFactory.createURI("o2") + val o3 = NodeFactory.createURI("o3") // generate dataset val g1 = RDFGraph(env.fromCollection( Seq( - RDFTriple("s1", "p1", "o1"), - RDFTriple("s1", "p1", "o2"), - RDFTriple("s1", "p1", "o3") + Triple.create(s1, p1, o1), + Triple.create(s1, p1, o2), + Triple.create(s1, p1, o3) ) )) val g2 = RDFGraph(env.fromCollection( Seq( - RDFTriple("s1", "p1", "o1"), - RDFTriple("s1", "p1", "o2") + Triple.create(s1, p1, o1), + Triple.create(s1, p1, o2) ) )) @@ -47,17 +55,12 @@ class RDFGraphTestCase(mode: TestExecutionMode) extends MultipleProgramsTestBase val result = g_diff.triples.collect() val expected = Seq( - RDFTriple("s1", "p1", "o3") + Triple.create(s1, p1, o3) ) - TestBaseUtils.compareResultCollections(new util.ArrayList(result.asJava), new util.ArrayList(expected.asJava), new Comparator[RDFTriple] { - override def compare(t1: RDFTriple, t2: RDFTriple): Int = - ComparisonChain.start() - .compare(t1.s, t2.s) - .compare(t1.p, t2.p) - .compare(t1.o, t2.o) - .result() - }) + TestBaseUtils.compareResultCollections( + new 
util.ArrayList(result.asJava), + new util.ArrayList(expected.asJava), + new TripleComparator()) } - } diff --git a/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/OWLHorstConformanceTest.scala b/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/OWLHorstConformanceTest.scala index fb7987b..81570ce 100644 --- a/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/OWLHorstConformanceTest.scala +++ b/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/OWLHorstConformanceTest.scala @@ -1,14 +1,14 @@ package net.sansa_stack.inference.flink.conformance -import net.sansa_stack.inference.flink.data.RDFGraphWriter -import net.sansa_stack.test.conformance.{IntegrationTestSuite, OWLHorstConformanceTestBase} +import scala.collection.mutable + import org.apache.flink.api.scala._ +import org.apache.jena.graph.Triple import org.apache.jena.rdf.model.Model -import net.sansa_stack.inference.data.{RDFTriple, SimpleRDFOps} -import net.sansa_stack.inference.flink.data.{RDFGraph, RDFGraphWriter} -import org.scalatest.Ignore -import scala.collection.mutable +import net.sansa_stack.inference.data.{Jena, JenaOps} +import net.sansa_stack.inference.flink.data.{RDFGraph, RDFGraphWriter} +import net.sansa_stack.test.conformance.OWLHorstConformanceTestBase /** * The class is to test the conformance of each materialization rule of OWL Horst entailment. 
@@ -16,10 +16,11 @@ import scala.collection.mutable * @author Lorenz Buehmann * */ -@IntegrationTestSuite -class OWLHorstConformanceTest extends OWLHorstConformanceTestBase(rdfOps = new SimpleRDFOps) with SharedOWLHorstReasonerContext{ +class OWLHorstConformanceTest + extends OWLHorstConformanceTestBase[Jena](rdfOps = new JenaOps) + with SharedOWLHorstReasonerContext{ - override def computeInferredModel(triples: mutable.HashSet[RDFTriple]): Model = { + override def computeInferredModel(triples: mutable.HashSet[Triple]): Model = { // distribute triples val triplesRDD = env.fromCollection(triples) diff --git a/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/RDFSConformanceTest.scala b/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/RDFSConformanceTest.scala index cfa371d..fd7b3dc 100644 --- a/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/RDFSConformanceTest.scala +++ b/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/RDFSConformanceTest.scala @@ -1,14 +1,14 @@ package net.sansa_stack.inference.flink.conformance -import net.sansa_stack.inference.flink.data.RDFGraphWriter -import net.sansa_stack.test.conformance.{IntegrationTestSuite, RDFSConformanceTestBase} -import org.apache.jena.rdf.model.Model -import net.sansa_stack.inference.data.{RDFTriple, SimpleRDFOps} -import net.sansa_stack.inference.flink.data.RDFGraph +import scala.collection.mutable + import org.apache.flink.api.scala._ -import org.scalatest.Ignore +import org.apache.jena.graph.Triple +import org.apache.jena.rdf.model.Model -import scala.collection.mutable +import net.sansa_stack.inference.data.{Jena, JenaOps} +import net.sansa_stack.inference.flink.data.{RDFGraph, RDFGraphWriter} +import net.sansa_stack.test.conformance.RDFSConformanceTestBase /** * The class is to test the conformance of each materialization rule of RDFS(simple) entailment. 
@@ -16,10 +16,11 @@ import scala.collection.mutable * @author Lorenz Buehmann * */ -@IntegrationTestSuite -class RDFSConformanceTest extends RDFSConformanceTestBase(rdfOps = new SimpleRDFOps) with SharedRDFSReasonerContext{ +class RDFSConformanceTest + extends RDFSConformanceTestBase[Jena](rdfOps = new JenaOps) + with SharedRDFSReasonerContext{ - override def computeInferredModel(triples: mutable.HashSet[RDFTriple]): Model = { + override def computeInferredModel(triples: mutable.HashSet[Triple]): Model = { // distribute triples val triplesRDD = env.fromCollection(triples) diff --git a/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/rules/TransitivityRuleTest.scala b/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/rules/TransitivityRuleTest.scala index 6e1940c..873d10f 100644 --- a/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/rules/TransitivityRuleTest.scala +++ b/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/rules/TransitivityRuleTest.scala @@ -1,13 +1,13 @@ package net.sansa_stack.inference.flink.rules -import net.sansa_stack.inference.flink.data.RDFGraphWriter -import net.sansa_stack.inference.flink.forwardchaining.ForwardRuleReasonerRDFS +import scala.collection.mutable + import org.apache.flink.api.scala.{ExecutionEnvironment, _} +import org.apache.jena.graph.{NodeFactory, Triple} import org.apache.jena.vocabulary.RDFS -import net.sansa_stack.inference.data.RDFTriple -import net.sansa_stack.inference.flink.data.{RDFGraph, RDFGraphWriter} -import scala.collection.mutable +import net.sansa_stack.inference.flink.data.{RDFGraph, RDFGraphWriter} +import net.sansa_stack.inference.flink.forwardchaining.ForwardRuleReasonerRDFS /** * A forward chaining implementation of the RDFS entailment regime. 
@@ -22,25 +22,24 @@ object TransitivityRuleTest { env.setParallelism(4) // generate graph - val triples = new mutable.HashSet[RDFTriple]() - val ns = "http://ex.org/" - val p1 = RDFS.subClassOf.getURI + val triples = new mutable.HashSet[Triple]() + val p1 = RDFS.subClassOf.asNode() val scale = 1 val begin = 1 val end = 10 * scale for(i <- begin to end) { - triples += RDFTriple(ns + "x" + i, p1, ns + "y" + i) - triples += RDFTriple(ns + "y" + i, p1, ns + "z" + i) - triples += RDFTriple(ns + "z" + i, p1, ns + "w" + i) + triples += Triple.create(NodeFactory.createURI("x" + i), p1, NodeFactory.createURI("y" + i)) + triples += Triple.create(NodeFactory.createURI("y" + i), p1, NodeFactory.createURI("z" + i)) + triples += Triple.create(NodeFactory.createURI("z" + i), p1, NodeFactory.createURI("w" + i)) } // graph is a path of length n // (x1, p, x2), (x2, p, x3), ..., (x(n-1), p, xn) val n = 10 for (i <- 1 to end) { - triples += RDFTriple(ns + "x" + i, p1, ns + "x" + (i + 1)) + triples += Triple.create(NodeFactory.createURI("x" + i), p1, NodeFactory.createURI("x" + (i + 1))) } val triplesDataset = env.fromCollection(triples) From 7d231ec36d1deb82d6532ac282c65a325749fa99 Mon Sep 17 00:00:00 2001 From: Lorenz Buehmann Date: Fri, 9 Dec 2016 09:40:53 +0100 Subject: [PATCH 03/23] Minor Pom changes --- sansa-inference-spark/pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sansa-inference-spark/pom.xml b/sansa-inference-spark/pom.xml index 5a3257d..4d91413 100644 --- a/sansa-inference-spark/pom.xml +++ b/sansa-inference-spark/pom.xml @@ -360,6 +360,8 @@ + dist @@ -367,8 +369,6 @@ dist - From 272f0e7d0d74bb333dac67369abb76e43597c092 Mon Sep 17 00:00:00 2001 From: Lorenz Buehmann Date: Tue, 26 Jun 2018 07:38:50 +0200 Subject: [PATCH 04/23] Flink reasoning with Jena datastructures. 
--- .../inference/flink/data/RDFGraph.scala | 23 ++- .../inference/flink/data/RDFGraphLoader.scala | 6 +- .../extraction/OWLHorstSchemaExtractor.scala | 4 +- .../extraction/RDFSSchemaExtractor.scala | 2 +- .../flink/extraction/SchemaExtractor.scala | 22 +-- .../forwardchaining/ForwardRuleReasoner.scala | 35 ++-- .../ForwardRuleReasonerOWLHorst.scala | 182 +++++++++--------- .../ForwardRuleReasonerRDFS.scala | 180 ++++++++--------- .../forwardchaining/TransitiveReasoner.scala | 39 ++-- 9 files changed, 253 insertions(+), 240 deletions(-) diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraph.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraph.scala index 66c64a8..b8c388d 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraph.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraph.scala @@ -1,9 +1,8 @@ package net.sansa_stack.inference.flink.data -import net.sansa_stack.inference.flink.utils.DataSetUtils import org.apache.flink.api.scala.{DataSet, _} -import org.apache.jena.graph.Triple -import net.sansa_stack.inference.data.RDFTriple +import org.apache.jena.graph.{Node, Triple} + import net.sansa_stack.inference.flink.utils.DataSetUtils.DataSetOps /** @@ -12,7 +11,7 @@ import net.sansa_stack.inference.flink.utils.DataSetUtils.DataSetOps * @author Lorenz Buehmann * */ -case class RDFGraph(triples: DataSet[RDFTriple]) { +case class RDFGraph(triples: DataSet[Triple]) { /** * Returns a DataSet of triples that match with the given input. 
@@ -22,11 +21,11 @@ case class RDFGraph(triples: DataSet[RDFTriple]) { * @param o the object * @return DataSet of triples */ - def find(s: Option[String] = None, p: Option[String] = None, o: Option[String] = None): DataSet[RDFTriple] = { + def find(s: Option[Node] = None, p: Option[Node] = None, o: Option[Node] = None): DataSet[Triple] = { triples.filter(t => - (s == None || t.s == s.get) && - (p == None || t.p == p.get) && - (o == None || t.o == o.get) + (s.isEmpty || t.subjectMatches(s.get)) && + (p.isEmpty || t.predicateMatches(p.get)) && + (o.isEmpty || t.objectMatches(o.get)) ) } @@ -35,11 +34,11 @@ case class RDFGraph(triples: DataSet[RDFTriple]) { * * @return DataSet of triples */ - def find(triple: Triple): DataSet[RDFTriple] = { + def find(triple: Triple): DataSet[Triple] = { find( - if (triple.getSubject.isVariable) None else Option(triple.getSubject.toString), - if (triple.getPredicate.isVariable) None else Option(triple.getPredicate.toString), - if (triple.getObject.isVariable) None else Option(triple.getObject.toString) + if (triple.getSubject.isVariable) None else Option(triple.getSubject), + if (triple.getPredicate.isVariable) None else Option(triple.getPredicate), + if (triple.getObject.isVariable) None else Option(triple.getObject) ) } diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraphLoader.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraphLoader.scala index a94d1ca..e631d4a 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraphLoader.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraphLoader.scala @@ -9,6 +9,8 @@ import net.sansa_stack.inference.data.RDFTriple import org.apache.flink.configuration.Configuration import scala.language.implicitConversions +import org.apache.jena.rdf.model.impl.NTripleReader + import net.sansa_stack.inference.utils.NTriplesStringToRDFTriple /** @@ 
-33,6 +35,7 @@ object RDFGraphLoader { // set the recursive enumeration parameter parameters.setBoolean("recursive.file.enumeration", true) + // pass the configuration to the data source val triples = env.readTextFile(path.toString).withParameters(parameters) .map(line => line.replace(">", "").replace("<", "").split("\\s+")) // line to tokens @@ -48,7 +51,8 @@ object RDFGraphLoader { val converter = new NTriplesStringToRDFTriple() - val triples = tmp.map(f => env.readTextFile(f).flatMap(line => converter.apply(line))).reduce(_ union _).name("triples") + val triples = tmp + .map(f => env.readTextFile(f).flatMap(line => converter.apply(line))).reduce(_ union _).name("triples") RDFGraph(triples) } diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/extraction/OWLHorstSchemaExtractor.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/extraction/OWLHorstSchemaExtractor.scala index 422deb5..065f8d0 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/extraction/OWLHorstSchemaExtractor.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/extraction/OWLHorstSchemaExtractor.scala @@ -42,12 +42,12 @@ class OWLHorstSchemaExtractor() OWL2.allValuesFrom, OWL2.hasValue, OWL2.onProperty - ).map(p => p.getURI) + ).map(p => p.asNode()) )( Set( OWL2.TransitiveProperty, OWL2.FunctionalProperty, OWL2.InverseFunctionalProperty, OWL2.SymmetricProperty - ).map(p => p.getURI) + ).map(p => p.asNode()) ) {} diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/extraction/RDFSSchemaExtractor.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/extraction/RDFSSchemaExtractor.scala index 945a0d8..086671c 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/extraction/RDFSSchemaExtractor.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/extraction/RDFSSchemaExtractor.scala 
@@ -15,7 +15,7 @@ import org.apache.jena.vocabulary.RDFS * @author Lorenz Buehmann */ class RDFSSchemaExtractor() - extends SchemaExtractor()(Set(RDFS.subClassOf, RDFS.subPropertyOf, RDFS.domain, RDFS.range).map(p => p.getURI))() {} + extends SchemaExtractor()(Set(RDFS.subClassOf, RDFS.subPropertyOf, RDFS.domain, RDFS.range).map(p => p.asNode()))() {} object RDFSSchemaExtractor { def apply: RDFSSchemaExtractor = new RDFSSchemaExtractor() diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/extraction/SchemaExtractor.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/extraction/SchemaExtractor.scala index 8b32c8e..10348a8 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/extraction/SchemaExtractor.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/extraction/SchemaExtractor.scala @@ -1,9 +1,8 @@ package net.sansa_stack.inference.flink.extraction import org.apache.flink.api.scala.DataSet -import org.apache.jena.vocabulary.RDFS +import org.apache.jena.graph.{Node, Triple} -import net.sansa_stack.inference.data.RDFTriple import net.sansa_stack.inference.flink.data.RDFGraph import net.sansa_stack.inference.utils.Logging @@ -11,16 +10,16 @@ import net.sansa_stack.inference.utils.Logging * @author Lorenz Buehmann */ abstract class SchemaExtractor - (subjects: Set[String] = Set()) - (predicates: Set[String] = Set()) - (objects: Set[String] = Set()) + (subjects: Set[Node] = Set()) + (predicates: Set[Node] = Set()) + (objects: Set[Node] = Set()) extends Logging with Serializable{ - val subjectsFilter: ((RDFTriple) => Boolean) = t => subjects.contains(t.s) - val predicatesFilter: ((RDFTriple) => Boolean) = t => predicates.contains(t.p) - val objectsFilter: ((RDFTriple) => Boolean) = t => objects.contains(t.o) + val subjectsFilter: ((Triple) => Boolean) = t => subjects.contains(t.getSubject) + val predicatesFilter: ((Triple) => Boolean) = t => 
predicates.contains(t.getPredicate) + val objectsFilter: ((Triple) => Boolean) = t => objects.contains(t.getObject) - private def or(ps: (RDFTriple => Boolean)*) = (a: RDFTriple) => ps.exists(_(a)) + private def or(ps: (Triple => Boolean)*) = (a: Triple) => ps.exists(_(a)) /** * Extract a graph that contains only the schema triples. @@ -28,8 +27,7 @@ abstract class SchemaExtractor * @param graph the graph * @return a graph containing only the schema triples */ - def extract(graph: RDFGraph): RDFGraph = - new RDFGraph(extract(graph.triples)) + def extract(graph: RDFGraph): RDFGraph = RDFGraph(extract(graph.triples)) /** * Extract a DataSet that contains only the schema triples. @@ -37,7 +35,7 @@ abstract class SchemaExtractor * @param triples the triples * @return the schema triples */ - def extract(triples: DataSet[RDFTriple]): DataSet[RDFTriple] = + def extract(triples: DataSet[Triple]): DataSet[Triple] = triples .filter(or(subjectsFilter, predicatesFilter, objectsFilter)) .name("schema-triples") diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasoner.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasoner.scala index c5bda4d..45e5cca 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasoner.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasoner.scala @@ -1,11 +1,11 @@ package net.sansa_stack.inference.flink.forwardchaining -import net.sansa_stack.inference.data.RDFTriple import net.sansa_stack.inference.flink.data.RDFGraph import org.apache.flink.api.scala.DataSet - import scala.collection.mutable +import org.apache.jena.graph.{Node, Triple} + /** * A forward chaining based reasoner. 
* @@ -22,6 +22,15 @@ trait ForwardRuleReasoner extends TransitiveReasoner{ */ def apply(graph: RDFGraph) : RDFGraph + /** + * Applies forward chaining to the given set of RDF triples and returns a new set of RDF triples that + * contains all additional triples based on the underlying set of rules. + * + * @param triples the RDF triples + * @return the materialized RDF triples + */ + def apply(triples: DataSet[Triple]) : DataSet[Triple] = apply(RDFGraph(triples)).triples + /** * Extracts all triples for the given predicate. * @@ -29,8 +38,8 @@ trait ForwardRuleReasoner extends TransitiveReasoner{ * @param predicate the predicate * @return the set of triples that contain the predicate */ - def extractTriples(triples: mutable.Set[RDFTriple], predicate: String): mutable.Set[RDFTriple] = { - triples.filter(triple => triple.p == predicate) + def extractTriples(triples: mutable.Set[Triple], predicate: Node): mutable.Set[Triple] = { + triples.filter(triple => triple.predicateMatches(predicate)) } /** @@ -40,8 +49,8 @@ trait ForwardRuleReasoner extends TransitiveReasoner{ * @param predicate the predicate * @return the DataSet of triples that contain the predicate */ - def extractTriples(triples: DataSet[RDFTriple], predicate: String): DataSet[RDFTriple] = { - triples.filter(triple => triple.p == predicate) + def extractTriples(triples: DataSet[Triple], predicate: Node): DataSet[Triple] = { + triples.filter(triple => triple.predicateMatches(predicate)) } /** @@ -53,22 +62,22 @@ trait ForwardRuleReasoner extends TransitiveReasoner{ * @param obj the object * @return the DataSet of triples that match */ - def extractTriples(triples: DataSet[RDFTriple], - subject: Option[String], - predicate: Option[String], - obj: Option[String]): DataSet[RDFTriple] = { + def extractTriples(triples: DataSet[Triple], + subject: Option[Node], + predicate: Option[Node], + obj: Option[Node]): DataSet[Triple] = { var extractedTriples = triples if(subject.isDefined) { - extractedTriples = 
extractedTriples.filter(triple => triple.s == subject.get) + extractedTriples = extractedTriples.filter(triple => triple.subjectMatches(subject.get)) } if(predicate.isDefined) { - extractedTriples = extractedTriples.filter(triple => triple.p == predicate.get) + extractedTriples = extractedTriples.filter(triple => triple.predicateMatches(predicate.get)) } if(obj.isDefined) { - extractedTriples = extractedTriples.filter(triple => triple.o == obj.get) + extractedTriples = extractedTriples.filter(triple => triple.objectMatches(obj.get)) } extractedTriples diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasonerOWLHorst.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasonerOWLHorst.scala index 68db1f1..2093e52 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasonerOWLHorst.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasonerOWLHorst.scala @@ -1,12 +1,13 @@ package net.sansa_stack.inference.flink.forwardchaining -import net.sansa_stack.inference.flink.data.RDFGraph import org.apache.flink.api.scala.{DataSet, ExecutionEnvironment, _} +import org.apache.jena.graph.{Node, Triple} import org.apache.jena.vocabulary.{OWL2, RDF, RDFS} -import net.sansa_stack.inference.data.RDFTriple -import net.sansa_stack.inference.utils.CollectionUtils import org.slf4j.LoggerFactory +import net.sansa_stack.inference.flink.data.RDFGraph +import net.sansa_stack.inference.utils.CollectionUtils + /** * A forward chaining implementation of the OWL Horst entailment regime. 
* @@ -26,26 +27,26 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule // extract the schema data - var subClassOfTriples = extractTriples(triplesRDD, RDFS.subClassOf.getURI) // rdfs:subClassOf - var subPropertyOfTriples = extractTriples(triplesRDD, RDFS.subPropertyOf.getURI) // rdfs:subPropertyOf - val domainTriples = extractTriples(triplesRDD, RDFS.domain.getURI) // rdfs:domain - val rangeTriples = extractTriples(triplesRDD, RDFS.range.getURI) // rdfs:range - val equivClassTriples = extractTriples(triplesRDD, OWL2.equivalentClass.getURI) // owl:equivalentClass - val equivPropertyTriples = extractTriples(triplesRDD, OWL2.equivalentProperty.getURI) // owl:equivalentProperty + var subClassOfTriples = extractTriples(triplesRDD, RDFS.subClassOf.asNode()) // rdfs:subClassOf + var subPropertyOfTriples = extractTriples(triplesRDD, RDFS.subPropertyOf.asNode()) // rdfs:subPropertyOf + val domainTriples = extractTriples(triplesRDD, RDFS.domain.asNode()) // rdfs:domain + val rangeTriples = extractTriples(triplesRDD, RDFS.range.asNode()) // rdfs:range + val equivClassTriples = extractTriples(triplesRDD, OWL2.equivalentClass.asNode()) // owl:equivalentClass + val equivPropertyTriples = extractTriples(triplesRDD, OWL2.equivalentProperty.asNode()) // owl:equivalentProperty // 1. 
we have to process owl:equivalentClass and owl:equivalentProperty before computing the transitive closure // rdfp12a: (?C owl:equivalentClass ?D) -> (?C rdfs:subClassOf ?D ) - val tmp_12a = equivClassTriples.map(t => RDFTriple(t.s, RDFS.subClassOf.getURI, t.o)) + val tmp_12a = equivClassTriples.map(t => Triple.create(t.getSubject, RDFS.subClassOf.asNode(), t.getObject)) // rdfp12b: (?C owl:equivalentClass ?D) -> (?D rdfs:subClassOf ?C ) - val tmp_12b = equivClassTriples.map(t => RDFTriple(t.o, RDFS.subClassOf.getURI, t.s)) + val tmp_12b = equivClassTriples.map(t => Triple.create(t.getObject, RDFS.subClassOf.asNode(), t.getSubject)) subClassOfTriples = env.union(Seq(subClassOfTriples, tmp_12a, tmp_12b)) .distinct() // rdfp13a: (?C owl:equivalentProperty ?D) -> (?C rdfs:subPropertyOf ?D ) - val tmp_13a = equivPropertyTriples.map(t => RDFTriple(t.s, RDFS.subPropertyOf.getURI, t.o)) + val tmp_13a = equivPropertyTriples.map(t => Triple.create(t.getSubject, RDFS.subPropertyOf.asNode(), t.getObject)) // rdfp13b: (?C owl:equivalentProperty ?D) -> (?D rdfs:subPropertyOf ?C ) - val tmp_13b = equivPropertyTriples.map(t => RDFTriple(t.o, RDFS.subPropertyOf.getURI, t.s)) + val tmp_13b = equivPropertyTriples.map(t => Triple.create(t.getObject, RDFS.subPropertyOf.asNode(), t.getSubject)) subPropertyOfTriples = env.union(Seq(subPropertyOfTriples, tmp_13a, tmp_13b)) .distinct() @@ -57,10 +58,10 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule // we put all into maps which should be more efficient later on - val subClassOfMap = CollectionUtils.toMultiMap(subClassOfTriplesTrans.map(t => (t.s, t.o)).collect) - val subPropertyMap = CollectionUtils.toMultiMap(subPropertyOfTriplesTrans.map(t => (t.s, t.o)).collect) - val domainMap = domainTriples.map(t => (t.s, t.o)).collect.toMap - val rangeMap = rangeTriples.map(t => (t.s, t.o)).collect.toMap + val subClassOfMap = CollectionUtils.toMultiMap(subClassOfTriplesTrans.map(t => (t.getSubject, 
t.getObject)).collect) + val subPropertyMap = CollectionUtils.toMultiMap(subPropertyOfTriplesTrans.map(t => (t.getSubject, t.getObject)).collect) + val domainMap = domainTriples.map(t => (t.getSubject, t.getObject)).collect.toMap + val rangeMap = rangeTriples.map(t => (t.getSubject, t.getObject)).collect.toMap // TODO broadcast schema in with Flink // // distribute the schema data structures by means of shared variables @@ -74,39 +75,39 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule // rdfp12c: (?C rdfs:subClassOf ?D ), (?D rdfs:subClassOf ?C ) -> (?C owl:equivalentClass ?D) val equivClassTriplesInf = equivClassTriples.union( subClassOfTriplesTrans - .filter(t => subClassOfMap.getOrElse(t.o, Set.empty).contains(t.s)) - .map(t => RDFTriple(t.s, OWL2.equivalentClass.getURI, t.o)) + .filter(t => subClassOfMap.getOrElse(t.getObject, Set.empty).contains(t.getSubject)) + .map(t => Triple.create(t.getSubject, OWL2.equivalentClass.asNode(), t.getObject)) ) // rdfp13c: (?C rdfs:subPropertyOf ?D ), (?D rdfs:subPropertyOf ?C ) -> (?C owl:equivalentProperty ?D) val equivPropTriplesInf = equivPropertyTriples.union( subPropertyOfTriplesTrans - .filter(t => subPropertyMap.getOrElse(t.o, Set.empty).contains(t.s)) - .map(t => RDFTriple(t.s, OWL2.equivalentProperty.getURI, t.o)) + .filter(t => subPropertyMap.getOrElse(t.getObject, Set.empty).contains(t.getSubject)) + .map(t => Triple.create(t.getSubject, OWL2.equivalentProperty.asNode(), t.getObject)) ) // we also extract properties with certain OWL characteristic and share them val transitiveProperties = - extractTriples(triplesRDD, None, None, Some(OWL2.TransitiveProperty.getURI)) - .map(triple => triple.s) + extractTriples(triplesRDD, None, None, Some(OWL2.TransitiveProperty.asNode())) + .map(triple => triple.getSubject) .collect() val functionalProperties = - extractTriples(triplesRDD, None, None, Some(OWL2.FunctionalProperty.getURI)) - .map(triple => triple.s) + extractTriples(triplesRDD, 
None, None, Some(OWL2.FunctionalProperty.asNode())) + .map(triple => triple.getSubject) .collect() val inverseFunctionalProperties = - extractTriples(triplesRDD, None, None, Some(OWL2.InverseFunctionalProperty.getURI)) - .map(triple => triple.s) + extractTriples(triplesRDD, None, None, Some(OWL2.InverseFunctionalProperty.asNode())) + .map(triple => triple.getSubject) .collect() val symmetricProperties = - extractTriples(triplesRDD, None, None, Some(OWL2.SymmetricProperty.getURI)) - .map(triple => triple.s) + extractTriples(triplesRDD, None, None, Some(OWL2.SymmetricProperty.asNode())) + .map(triple => triple.getSubject) .collect() // and inverse property definitions val inverseOfMap = - extractTriples(triplesRDD, None, Some(OWL2.inverseOf.getURI), None) - .map(triple => (triple.s, triple.o)) + extractTriples(triplesRDD, None, Some(OWL2.inverseOf.asNode()), None) + .map(triple => (triple.getSubject, triple.getObject)) .collect() .toMap val inverseOfMapReverted = inverseOfMap.map(_.swap) @@ -114,38 +115,39 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule // and more OWL vocabulary used in property restrictions // owl:someValuesFrom val someValuesFromMap = - extractTriples(triplesRDD, None, Some(OWL2.someValuesFrom.getURI), None) - .map(triple => (triple.s, triple.o)) + extractTriples(triplesRDD, None, Some(OWL2.someValuesFrom.asNode()), None) + .map(triple => (triple.getSubject, triple.getObject)) .collect() .toMap val someValuesFromMapReversed = someValuesFromMap.map(_.swap) // owl:allValuesFrom val allValuesFromMap = - extractTriples(triplesRDD, None, Some(OWL2.allValuesFrom.getURI), None) - .map(triple => (triple.s, triple.o)) + extractTriples(triplesRDD, None, Some(OWL2.allValuesFrom.asNode()), None) + .map(triple => (triple.getSubject, triple.getObject)) .collect() .toMap val allValuesFromMapReversed = allValuesFromMap.map(_.swap) // owl:hasValue val hasValueMap = - extractTriples(triplesRDD, None, Some(OWL2.hasValue.getURI), 
None) - .map(triple => (triple.s, triple.o)) + extractTriples(triplesRDD, None, Some(OWL2.hasValue.asNode()), None) + .map(triple => (triple.getSubject, triple.getObject)) .collect() .toMap val hasValueMapReversed = hasValueMap.groupBy(_._2).mapValues(_.keys).map(identity) // owl:onProperty val onPropertyMap = - extractTriples(triplesRDD, None, Some(OWL2.onProperty.getURI), None) - .map(triple => (triple.s, triple.o)) + extractTriples(triplesRDD, None, Some(OWL2.onProperty.asNode()), None) + .map(triple => (triple.getSubject, triple.getObject)) .collect() .toMap val onPropertyMapReversed = onPropertyMap.groupBy(_._2).mapValues(_.keys).map(identity) // owl:sameAs is computed separately, thus, we split the data - var triplesFiltered = triplesRDD.filter(triple => triple.p != OWL2.sameAs.getURI && triple.p != RDF.`type`.getURI) - var sameAsTriples = triplesRDD.filter(triple => triple.p == OWL2.sameAs.getURI) - var typeTriples = triplesRDD.filter(triple => triple.p == RDF.`type`.getURI) + var triplesFiltered = triplesRDD.filter(triple => !triple.predicateMatches(OWL2.sameAs.asNode()) + && !triple.predicateMatches(RDF.`type`.asNode())) + var sameAsTriples = triplesRDD.filter(triple => triple.predicateMatches(OWL2.sameAs.asNode())) + var typeTriples = triplesRDD.filter(triple => triple.predicateMatches(RDF.`type`.asNode())) // println("input rdf:type triples:\n" + typeTriples.collect().mkString("\n")) @@ -164,8 +166,8 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule */ val triplesRDFS7 = triplesFiltered - .filter(t => subPropertyMap.contains(t.p)) - .flatMap(t => subPropertyMap(t.p).map(supProp => RDFTriple(t.s, supProp, t.o))) + .filter(t => subPropertyMap.contains(t.getPredicate)) + .flatMap(t => subPropertyMap(t.getPredicate).map(supProp => Triple.create(t.getSubject, supProp, t.getObject))) // add the inferred triples to the existing triples val rdfs7Res = triplesRDFS7.union(triplesFiltered) @@ -178,8 +180,8 @@ class 
ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule */ val triplesRDFS2 = rdfs7Res - .filter(t => domainMap.contains(t.p)) - .map(t => RDFTriple(t.s, RDF.`type`.getURI, domainMap(t.p))) + .filter(t => domainMap.contains(t.getPredicate)) + .map(t => Triple.create(t.getSubject, RDF.`type`.asNode(), domainMap(t.getPredicate))) /* rdfs3 aaa rdfs:range xxx . @@ -187,8 +189,8 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule */ val triplesRDFS3 = rdfs7Res - .filter(t => rangeMap.contains(t.p)) - .map(t => RDFTriple(t.o, RDF.`type`.getURI, rangeMap(t.p))) + .filter(t => rangeMap.contains(t.getPredicate)) + .map(t => Triple.create(t.getObject, RDF.`type`.asNode(), rangeMap(t.getPredicate))) // 4. SubClass inheritance according to rdfs9 @@ -202,18 +204,18 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule triplesRDFS2 .union(triplesRDFS3) .union(typeTriples) - .filter(t => subClassOfMap.contains(t.o)) // such that A has a super class B - .flatMap(t => subClassOfMap(t.o).map(supCls => RDFTriple(t.s, RDF.`type`.getURI, supCls))) // create triple (s a B) + .filter(t => subClassOfMap.contains(t.getObject)) // such that A has a super class B + .flatMap(t => subClassOfMap(t.getObject).map(supCls => Triple.create(t.getSubject, RDF.`type`.asNode(), supCls))) // create triple (s a B) // rdfp14b: (?R owl:hasValue ?V),(?R owl:onProperty ?P),(?X rdf:type ?R ) -> (?X ?P ?V ) val rdfp14b = typeTriples .filter(triple => - hasValueMap.contains(triple.o) && - onPropertyMap.contains(triple.o) + hasValueMap.contains(triple.getObject) && + onPropertyMap.contains(triple.getObject) ) .map(triple => - RDFTriple(triple.s, onPropertyMap(triple.o), hasValueMap(triple.o)) + Triple.create(triple.getSubject, onPropertyMap(triple.getObject), hasValueMap(triple.getObject)) ) // rdfp14a: (?R owl:hasValue ?V), (?R owl:onProperty ?P), (?U ?P ?V) -> (?U rdf:type ?R) @@ -221,12 +223,12 @@ class 
ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule val rdfp14a = rdfs7Res .filter(triple => { var valueRestrictionExists = false - if (onPropertyMapReversed.contains(triple.p)) { + if (onPropertyMapReversed.contains(triple.getPredicate)) { // there is any restriction R for property P - onPropertyMapReversed(triple.p).foreach { restriction => + onPropertyMapReversed(triple.getPredicate).foreach { restriction => if (hasValueMap.contains(restriction) && // R a hasValue restriction - hasValueMap(restriction) == triple.o) { + hasValueMap(restriction) == triple.getObject) { // with value V valueRestrictionExists = true } @@ -236,74 +238,74 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule }) .map(triple => { - val s = triple.s - val p = RDF.`type`.getURI - var o = "" - onPropertyMapReversed(triple.p).foreach { restriction => // get the restriction R + val s = triple.getSubject + val p = RDF.`type`.asNode() + var o: Node = null + onPropertyMapReversed(triple.getPredicate).foreach { restriction => // get the restriction R if (hasValueMap.contains(restriction) && // R a hasValue restriction - hasValueMap(restriction) == triple.o) { // with value V + hasValueMap(restriction) == triple.getObject) { // with value V o = restriction } } - RDFTriple(s, p, o) + Triple.create(s, p, o) } ) println(rdfp14a.collect().mkString("\n")) // rdfp8a: (?P owl:inverseOf ?Q), (?X ?P ?Y) -> (?Y ?Q ?X) val rdfp8a = triplesFiltered - .filter(triple => inverseOfMap.contains(triple.p)) - .map(triple => RDFTriple(triple.o, inverseOfMap(triple.p), triple.s)) + .filter(triple => inverseOfMap.contains(triple.getPredicate)) + .map(triple => Triple.create(triple.getObject, inverseOfMap(triple.getPredicate), triple.getSubject)) // rdfp8b: (?P owl:inverseOf ?Q), (?X ?Q ?Y) -> (?Y ?P ?X) val rdfp8b = triplesFiltered - .filter(triple => inverseOfMapReverted.contains(triple.p)) - .map(triple => RDFTriple(triple.o, inverseOfMapReverted(triple.p), 
triple.s)) + .filter(triple => inverseOfMapReverted.contains(triple.getPredicate)) + .map(triple => Triple.create(triple.getObject, inverseOfMapReverted(triple.getPredicate), triple.getSubject)) // rdfp3: (?P rdf:type owl:SymmetricProperty), (?X ?P ?Y) -> (?Y ?P ?X) val rdfp3 = triplesFiltered - .filter(triple => symmetricProperties.contains(triple.p)) - .map(triple => RDFTriple(triple.o, triple.p, triple.s)) + .filter(triple => symmetricProperties.contains(triple.getPredicate)) + .map(triple => Triple.create(triple.getObject, triple.getPredicate, triple.getSubject)) // rdfp15: (?R owl:someValuesFrom ?D), (?R owl:onProperty ?P), (?X ?P ?A), (?A rdf:type ?D ) -> (?X rdf:type ?R ) val rdfp15_1 = triplesFiltered - .filter(triple => onPropertyMapReversed.contains(triple.p)) // && someValuesFromMapBC.value.contains(onPropertyMapReversedBC.value(triple.predicate))) + .filter(triple => onPropertyMapReversed.contains(triple.getPredicate)) // && someValuesFromMapBC.value.contains(onPropertyMapReversedBC.value(triple.predicate))) .flatMap(triple => { - val restrictions = onPropertyMapReversed(triple.p) - restrictions.map(_r => (_r -> triple.o, triple.s)) // -> ((?R, ?A), ?X) + val restrictions = onPropertyMapReversed(triple.getPredicate) + restrictions.map(_r => (_r -> triple.getObject, triple.getSubject)) // -> ((?R, ?A), ?X) }) // .flatMap(identity) val rdfp15_2 = typeTriples - .filter(triple => someValuesFromMapReversed.contains(triple.o)) - .map(triple => ((someValuesFromMapReversed(triple.o), triple.s), "s")) // -> ((?R, ?A), NIL) + .filter(triple => someValuesFromMapReversed.contains(triple.getObject)) + .map(triple => ((someValuesFromMapReversed(triple.getObject), triple.getSubject), "s")) // -> ((?R, ?A), NIL) val rdfp15 = rdfp15_1 .join(rdfp15_2).where(0).equalTo(0)({// ((?R, ?A), ?X) x ((?R, ?A), NIL) (l, r) => (l._2, r._1._1) }) - .map(e => RDFTriple(e._1, RDF.`type`.getURI, e._2)) // -> (?X rdf:type ?R ) + .map(e => Triple.create(e._1, RDF.`type`.asNode(), e._2)) 
// -> (?X rdf:type ?R ) // println(rdfp15.collect().mkString("\n")) // rdfp16: (?R owl:allValuesFrom ?D), (?R owl:onProperty ?P), (?X ?P ?Y), (?X rdf:type ?R ) -> (?Y rdf:type ?D ) val rdfp16_1 = triplesFiltered // (?X ?P ?Y) - .filter(triple => onPropertyMapReversed.contains(triple.p) && - allValuesFromMap.keySet.intersect(onPropertyMapReversed(triple.p).toSet).nonEmpty) // (?R owl:allValuesFrom ?D), (?R owl:onProperty ?P) + .filter(triple => onPropertyMapReversed.contains(triple.getPredicate) && + allValuesFromMap.keySet.intersect(onPropertyMapReversed(triple.getPredicate).toSet).nonEmpty) // (?R owl:allValuesFrom ?D), (?R owl:onProperty ?P) .flatMap(triple => { - val restrictions = onPropertyMapReversed(triple.p) - restrictions.map(_r => (triple.s -> _r, triple.o)) // -> ((?X, ?R), ?Y) + val restrictions = onPropertyMapReversed(triple.getPredicate) + restrictions.map(_r => (triple.getSubject -> _r, triple.getObject)) // -> ((?X, ?R), ?Y) }) // .flatMap(identity) // println("rdfp16_1:\n" + rdfp16_1.collect().mkString("\n")) val rdfp16_2 = typeTriples // (?X rdf:type ?R ) - .filter(triple => allValuesFromMap.contains(triple.o) && onPropertyMap.contains(triple.o)) // (?R owl:allValuesFrom ?D), (?R owl:onProperty ?P) - .map(triple => ((triple.s, triple.o), allValuesFromMap(triple.o))) // -> ((?X, ?R), ?D) + .filter(triple => allValuesFromMap.contains(triple.getObject) && onPropertyMap.contains(triple.getObject)) // (?R owl:allValuesFrom ?D), (?R owl:onProperty ?P) + .map(triple => ((triple.getSubject, triple.getObject), allValuesFromMap(triple.getObject))) // -> ((?X, ?R), ?D) // println("rdfp16_2:\n" + rdfp16_2.collect().mkString("\n")) @@ -311,7 +313,7 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule .join(rdfp16_2).where(0).equalTo(0) ({// ((?X, ?R), ?Y) x ((?X, ?R), ?D) (l, r) => (l._2, r._2) // -> (Y, D) }) - .map(e => RDFTriple(e._1, RDF.`type`.getURI, e._2)) // -> (?Y rdf:type ?D ) + .map(e => Triple.create(e._1, 
RDF.`type`.asNode(), e._2)) // -> (?Y rdf:type ?D ) // println(rdfp15.collect().mkString("\n")) // deduplicate @@ -326,7 +328,7 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule triplesFiltered = triplesFiltered.union(triplesNew) // rdfp4: (?P rdf:type owl:TransitiveProperty), (?X ?P ?Y), (?Y ?P ?Z) -> (?X ?P ?Z) - val rdfp4 = computeTransitiveClosure(triplesFiltered.filter(triple => transitiveProperties.contains(triple.p))) + val rdfp4 = computeTransitiveClosure(triplesFiltered.filter(triple => transitiveProperties.contains(triple.getPredicate))) // add triples triplesFiltered = triplesFiltered.union(rdfp4) @@ -352,8 +354,8 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule // rdfp1: (?P rdf:type owl:FunctionalProperty), (?A ?P ?B), notLiteral(?B), (?A ?P ?C), notLiteral(?C), notEqual(?B ?C) -> (?B owl:sameAs ?C) val rdfp1_1 = triplesFiltered - .filter(triple => functionalProperties.contains(triple.p)) - .map(triple => (triple.s, triple.p) -> triple.o) // -> ((?A, ?P), ?B) + .filter(triple => functionalProperties.contains(triple.getPredicate)) + .map(triple => (triple.getSubject, triple.getPredicate) -> triple.getObject) // -> ((?A, ?P), ?B) // println(rdfp1_1.collect().mkString("\n")) // println("Joined:" + rdfp1_1.join(rdfp1_1).collect().mkString("\n")) // apply self join @@ -362,18 +364,18 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule (l, r) => (l._2, r._2) // -> (?B, ?C) }) .filter(e => e._1 != e._2) // notEqual(?B ?C) - .map(e => RDFTriple(e._1, OWL2.sameAs.getURI, e._2)) // -> (?B owl:sameAs ?C) + .map(e => Triple.create(e._1, OWL2.sameAs.asNode(), e._2)) // -> (?B owl:sameAs ?C) // rdfp2: (?P rdf:type owl:InverseFunctionalProperty), (?A ?P ?B), (?C ?P ?B), notEqual(?A ?C) -> (?A owl:sameAs ?C) val rdfp2_1 = triplesFiltered - .filter(triple => inverseFunctionalProperties.contains(triple.p)) - .map(triple => (triple.o, triple.p) -> triple.s) // -> ((?B, 
?P), ?A) + .filter(triple => inverseFunctionalProperties.contains(triple.getPredicate)) + .map(triple => (triple.getObject, triple.getPredicate) -> triple.getSubject) // -> ((?B, ?P), ?A) val rdfp2 = rdfp2_1 .join(rdfp2_1).where(0).equalTo(0) ({// ((?B, ?P), ?A) x ((?B, ?P), ?C) (l, r) => (l._2, r._2) // -> (?A, ?C) }) .filter(e => e._1 != e._2) // notEqual(?A ?C) - .map(e => RDFTriple(e._1, OWL2.sameAs.getURI, e._2)) // -> (?A owl:sameAs ?C) + .map(e => Triple.create(e._1, OWL2.sameAs.asNode(), e._2)) // -> (?A owl:sameAs ?C) triplesFiltered = triplesFiltered.union(rdfp1).union(rdfp2) @@ -400,7 +402,7 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule RDFGraph(inferredTriples) } - def deduplicate(triples: DataSet[RDFTriple]): DataSet[RDFTriple] = { + def deduplicate(triples: DataSet[Triple]): DataSet[Triple] = { triples.distinct() } @@ -421,7 +423,7 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule // // val rdfp15 = rdfp15_1 // .join(rdfp15_2) -// .map(e => RDFTriple(e._2._1, RDF.`type`.getURI, e._1._1)) // -> (?X rdf:type ?R ) +// .map(e => Triple.create(e._2._1, RDF.`type`.asNode(), e._1._1)) // -> (?X rdf:type ?R ) // // rdfp15 // } diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasonerRDFS.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasonerRDFS.scala index 88d2c95..a1538b7 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasonerRDFS.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasonerRDFS.scala @@ -4,13 +4,13 @@ import scala.collection.JavaConverters._ import scala.collection.mutable import org.apache.flink.api.common.functions.{RichFilterFunction, RichFlatMapFunction} -import org.apache.flink.api.scala.{ExecutionEnvironment, _} +import 
org.apache.flink.api.scala.{DataSet, ExecutionEnvironment, _} import org.apache.flink.configuration.Configuration import org.apache.flink.util.Collector +import org.apache.jena.graph.Triple import org.apache.jena.vocabulary.{RDF, RDFS} import org.slf4j.LoggerFactory -import net.sansa_stack.inference.data.RDFTriple import net.sansa_stack.inference.flink.data.RDFGraph import net.sansa_stack.inference.flink.extraction.RDFSSchemaExtractor import net.sansa_stack.inference.flink.utils.DataSetUtils.DataSetOps @@ -54,7 +54,7 @@ class ForwardRuleReasonerRDFS(env: ExecutionEnvironment) extends ForwardRuleReas * yyy rdfs:subClassOf zzz . xxx rdfs:subClassOf zzz . */ val subClassOfTriples = - extractTriples(schemaTriples, RDFS.subClassOf.getURI) + extractTriples(schemaTriples, RDFS.subClassOf.asNode()) .name("rdfs:subClassOf") // extract rdfs:subClassOf triples val subClassOfTriplesTrans = computeTransitiveClosureOptSemiNaive(subClassOfTriples).name("rdfs11") @@ -64,13 +64,13 @@ class ForwardRuleReasonerRDFS(env: ExecutionEnvironment) extends ForwardRuleReas yyy rdfs:subPropertyOf zzz . => xxx rdfs:subPropertyOf zzz . 
*/ val subPropertyOfTriples = - extractTriples(schemaTriples, RDFS.subPropertyOf.getURI) + extractTriples(schemaTriples, RDFS.subPropertyOf.asNode()) .name("rdfs:subPropertyOf") // extract rdfs:subPropertyOf triples val subPropertyOfTriplesTrans = computeTransitiveClosureOptSemiNaive(subPropertyOfTriples).name("rdfs5") // split by rdf:type - val split = triplesDS.partitionBy(t => t.p == RDF.`type`.getURI) + val split = triplesDS.partitionBy(t => t.predicateMatches(RDF.`type`.asNode())) var typeTriples = split._1 var otherTriples = split._2 @@ -82,39 +82,39 @@ class ForwardRuleReasonerRDFS(env: ExecutionEnvironment) extends ForwardRuleReas */ val triplesRDFS7 = if (useSchemaBroadCasting) { otherTriples - .filter(new RichFilterFunction[RDFTriple]() { + .filter(new RichFilterFunction[Triple]() { - var broadcastSet: Traversable[RDFTriple] = _ + var broadcastSet: Traversable[Triple] = _ override def open(config: Configuration): Unit = { - broadcastSet = getRuntimeContext().getBroadcastVariable[RDFTriple]("subPropertyTriples").asScala + broadcastSet = getRuntimeContext.getBroadcastVariable[Triple]("subPropertyTriples").asScala } - override def filter(t: RDFTriple): Boolean = broadcastSet.exists(_.s == t.p) + override def filter(t: Triple): Boolean = broadcastSet.exists(_.subjectMatches(t.getPredicate)) }) .withBroadcastSet(subPropertyOfTriplesTrans, "subPropertyTriples") // .flatMap(new SubClassOfFlatMapFunction("subClasses")).withBroadcastSet(subClassOfTriplesTrans, "subClasses") // create triple (s a B) - .flatMap(new RichFlatMapFunction[RDFTriple, RDFTriple]() { - var broadcastSet: Traversable[RDFTriple] = _ + .flatMap(new RichFlatMapFunction[Triple, Triple]() { + var broadcastSet: Traversable[Triple] = _ override def open(config: Configuration): Unit = { - broadcastSet = getRuntimeContext().getBroadcastVariable[RDFTriple]("subPropertyTriples").asScala + broadcastSet = getRuntimeContext.getBroadcastVariable[Triple]("subPropertyTriples").asScala } - override def 
flatMap(in: RDFTriple, collector: Collector[RDFTriple]): Unit = { + override def flatMap(in: Triple, collector: Collector[Triple]): Unit = { broadcastSet - .filter(_.s == in.p) - .foreach(t => collector.collect(RDFTriple(in.s, t.o, in.o))) + .filter(_.subjectMatches(in.getPredicate)) + .foreach(t => collector.collect(Triple.create(in.getSubject, t.getObject, in.getObject))) } }) .withBroadcastSet(subPropertyOfTriplesTrans, "subPropertyTriples") } else { val subPropertyMap = - CollectionUtils.toMultiMap(subPropertyOfTriplesTrans.map(t => (t.s, t.o)).collect) + CollectionUtils.toMultiMap(subPropertyOfTriplesTrans.map(t => (t.getSubject, t.getObject)).collect) otherTriples // all triples (s p1 o) - .filter(t => subPropertyMap.contains(t.p)) // such that p1 has a super property p2 - .flatMap(t => subPropertyMap(t.p).map(supProp => RDFTriple(t.s, supProp, t.o))) // create triple (s p2 o) + .filter(t => subPropertyMap.contains(t.getPredicate)) // such that p1 has a super property p2 + .flatMap(t => subPropertyMap(t.getPredicate).map(supProp => Triple.create(t.getSubject, supProp, t.getObject))) // create triple (s p2 o) }.name("rdfs7") @@ -128,42 +128,42 @@ class ForwardRuleReasonerRDFS(env: ExecutionEnvironment) extends ForwardRuleReas yyy aaa zzz . => yyy rdf:type xxx . 
*/ val domainTriples = - extractTriples(schemaTriples, RDFS.domain.getURI).name("rdfs:domain") + extractTriples(schemaTriples, RDFS.domain.asNode()).name("rdfs:domain") val triplesRDFS2 = if (useSchemaBroadCasting) { otherTriples - .filter(new RichFilterFunction[RDFTriple]() { + .filter(new RichFilterFunction[Triple]() { - var broadcastSet: Traversable[RDFTriple] = _ + var broadcastSet: Traversable[Triple] = _ override def open(config: Configuration): Unit = { - broadcastSet = getRuntimeContext().getBroadcastVariable[RDFTriple]("domainTriples").asScala + broadcastSet = getRuntimeContext.getBroadcastVariable[Triple]("domainTriples").asScala } - override def filter(t: RDFTriple): Boolean = - broadcastSet.exists(_.s == t.p) + override def filter(t: Triple): Boolean = + broadcastSet.exists(_.subjectMatches(t.getPredicate)) }) .withBroadcastSet(domainTriples, "domainTriples") - .flatMap(new RichFlatMapFunction[RDFTriple, RDFTriple]() { - var broadcastSet: Traversable[RDFTriple] = _ + .flatMap(new RichFlatMapFunction[Triple, Triple]() { + var broadcastSet: Traversable[Triple] = _ override def open(config: Configuration): Unit = { - broadcastSet = getRuntimeContext().getBroadcastVariable[RDFTriple]("domainTriples").asScala + broadcastSet = getRuntimeContext.getBroadcastVariable[Triple]("domainTriples").asScala } - override def flatMap(in: RDFTriple, collector: Collector[RDFTriple]): Unit = { + override def flatMap(in: Triple, collector: Collector[Triple]): Unit = { broadcastSet - .filter(_.s == in.p) - .foreach(t => collector.collect(RDFTriple(in.s, RDF.`type`.getURI, t.o))) + .filter(_.subjectMatches(in.getPredicate)) + .foreach(t => collector.collect(Triple.create(in.getSubject, RDF.`type`.asNode(), t.getObject))) } }) .withBroadcastSet(domainTriples, "domainTriples") } else { - val domainMap = domainTriples.map(t => (t.s, t.o)).collect.toMap + val domainMap = domainTriples.map(t => (t.getSubject, t.getObject)).collect.toMap otherTriples - .filter(t => 
domainMap.contains(t.p)) - .map(t => RDFTriple(t.s, RDF.`type`.getURI, domainMap(t.p))) + .filter(t => domainMap.contains(t.getPredicate)) + .map(t => Triple.create(t.getSubject, RDF.`type`.asNode(), domainMap(t.getPredicate))) }.name("rdfs2") @@ -172,42 +172,42 @@ class ForwardRuleReasonerRDFS(env: ExecutionEnvironment) extends ForwardRuleReas yyy aaa zzz . => zzz rdf:type xxx . */ val rangeTriples = - extractTriples(schemaTriples, RDFS.range.getURI).name("rdfs:range") + extractTriples(schemaTriples, RDFS.range.asNode()).name("rdfs:range") val triplesRDFS3 = if (useSchemaBroadCasting) { otherTriples - .filter(new RichFilterFunction[RDFTriple]() { + .filter(new RichFilterFunction[Triple]() { - var broadcastSet: Traversable[RDFTriple] = _ + var broadcastSet: Traversable[Triple] = _ override def open(config: Configuration): Unit = { - broadcastSet = getRuntimeContext().getBroadcastVariable[RDFTriple]("rangeTriples").asScala + broadcastSet = getRuntimeContext.getBroadcastVariable[Triple]("rangeTriples").asScala } - override def filter(t: RDFTriple): Boolean = - broadcastSet.exists(_.s == t.p) + override def filter(t: Triple): Boolean = + broadcastSet.exists(_.subjectMatches(t.getPredicate)) }) .withBroadcastSet(rangeTriples, "rangeTriples") - .flatMap(new RichFlatMapFunction[RDFTriple, RDFTriple]() { - var broadcastSet: Traversable[RDFTriple] = _ + .flatMap(new RichFlatMapFunction[Triple, Triple]() { + var broadcastSet: Traversable[Triple] = _ override def open(config: Configuration): Unit = { - broadcastSet = getRuntimeContext().getBroadcastVariable[RDFTriple]("rangeTriples").asScala + broadcastSet = getRuntimeContext.getBroadcastVariable[Triple]("rangeTriples").asScala } - override def flatMap(in: RDFTriple, collector: Collector[RDFTriple]): Unit = { + override def flatMap(in: Triple, collector: Collector[Triple]): Unit = { broadcastSet - .filter(_.s == in.p) - .foreach(t => collector.collect(RDFTriple(in.o, RDF.`type`.getURI, t.o))) + 
.filter(_.subjectMatches(in.getPredicate)) + .foreach(t => collector.collect(Triple.create(in.getObject, RDF.`type`.asNode(), t.getObject))) } }) .withBroadcastSet(rangeTriples, "rangeTriples") } else { - val rangeMap = rangeTriples.map(t => (t.s, t.o)).collect().toMap + val rangeMap = rangeTriples.map(t => (t.getSubject, t.getObject)).collect().toMap otherTriples - .filter(t => rangeMap.contains(t.p)) - .map(t => RDFTriple(t.o, RDF.`type`.getURI, rangeMap(t.p))) + .filter(t => rangeMap.contains(t.getPredicate)) + .map(t => Triple.create(t.getObject, RDF.`type`.asNode(), rangeMap(t.getPredicate))) }.name("rdfs3") @@ -226,42 +226,42 @@ class ForwardRuleReasonerRDFS(env: ExecutionEnvironment) extends ForwardRuleReas val triplesRDFS9 = if (useSchemaBroadCasting) { typeTriples // all rdf:type triples (s a A) // .filter(new SubClassOfFilterFunction("subClasses")).withBroadcastSet(subClassOfTriplesTrans, "subClasses") // such that A has a super class B - .filter(new RichFilterFunction[RDFTriple]() { + .filter(new RichFilterFunction[Triple]() { - var broadcastSet: Traversable[RDFTriple] = _ + var broadcastSet: Traversable[Triple] = _ override def open(config: Configuration): Unit = { - broadcastSet = getRuntimeContext().getBroadcastVariable[RDFTriple]("subClassTriples").asScala + broadcastSet = getRuntimeContext.getBroadcastVariable[Triple]("subClassTriples").asScala } - override def filter(t: RDFTriple): Boolean = - broadcastSet.exists(_.s == t.o) + override def filter(t: Triple): Boolean = + broadcastSet.exists(_.subjectMatches(t.getObject)) }) .withBroadcastSet(subClassOfTriplesTrans, "subClassTriples") // .flatMap(new SubClassOfFlatMapFunction("subClasses")).withBroadcastSet(subClassOfTriplesTrans, "subClasses") // create triple (s a B) - .flatMap(new RichFlatMapFunction[RDFTriple, RDFTriple]() { - var broadcastSet: Traversable[RDFTriple] = _ + .flatMap(new RichFlatMapFunction[Triple, Triple]() { + var broadcastSet: Traversable[Triple] = _ override def open(config: 
Configuration): Unit = { - broadcastSet = getRuntimeContext().getBroadcastVariable[RDFTriple]("subClassTriples").asScala + broadcastSet = getRuntimeContext.getBroadcastVariable[Triple]("subClassTriples").asScala } - override def flatMap(in: RDFTriple, collector: Collector[RDFTriple]): Unit = { + override def flatMap(in: Triple, collector: Collector[Triple]): Unit = { broadcastSet - .filter(t => in.o == t.s) - .foreach(t => collector.collect(RDFTriple(in.s, in.p, t.o))) + .filter(t => in.objectMatches(t.getSubject)) + .foreach(t => collector.collect(Triple.create(in.getSubject, in.getPredicate, t.getObject))) } }) .withBroadcastSet(subClassOfTriplesTrans, "subClassTriples") } else { - val subClassOfMap = CollectionUtils.toMultiMap(subClassOfTriplesTrans.map(t => (t.s, t.o)).collect) + val subClassOfMap = CollectionUtils.toMultiMap(subClassOfTriplesTrans.map(t => (t.getSubject, t.getObject)).collect) typeTriples // all rdf:type triples (s a A) - .filter(t => subClassOfMap.contains(t.o)) // such that A has a super class B + .filter(t => subClassOfMap.contains(t.getObject)) // such that A has a super class B .flatMap( t => - subClassOfMap(t.o) - .map(supCls => RDFTriple(t.s, RDF.`type`.getURI, supCls)) + subClassOfMap(t.getObject) + .map(supCls => Triple.create(t.getSubject, RDF.`type`.asNode(), supCls)) ) // create triple (s a B) }.name("rdfs9") @@ -284,39 +284,39 @@ class ForwardRuleReasonerRDFS(env: ExecutionEnvironment) extends ForwardRuleReas .flatMap( t => Set( - RDFTriple(t.s, RDF.`type`.getURI, RDFS.Resource.getURI), - RDFTriple(t.o, RDF.`type`.getURI, RDFS.Resource.getURI) - // RDFTriple(t.predicate, RDF.`type`.getURI, RDF.Property.getURI) + Triple.create(t.getSubject, RDF.`type`.asNode(), RDFS.Resource.asNode()), + Triple.create(t.getObject, RDF.`type`.asNode(), RDFS.Resource.asNode()) + // Triple(t.predicate, RDF.`type`.getURI, RDF.Property.getURI) ) ) .name("rdfs4") // rdfs12: (?x rdf:type rdfs:ContainerMembershipProperty) -> (?x rdfs:subPropertyOf 
rdfs:member) val rdfs12 = typeTriples - .filter(t => t.o == RDFS.ContainerMembershipProperty.getURI) - .map(t => RDFTriple(t.s, RDF.`type`.getURI, RDFS.member.getURI)) + .filter(t => t.objectMatches(RDFS.ContainerMembershipProperty.asNode())) + .map(t => Triple.create(t.getSubject, RDFS.subPropertyOf.asNode(), RDFS.member.asNode())) .name("rdfs12") // rdfs6: (p rdf:type rdf:Property) => (p rdfs:subPropertyOf p) val rdfs6 = typeTriples - .filter(t => t.o == RDF.Property.getURI) - .map(t => RDFTriple(t.s, RDFS.subPropertyOf.getURI, t.s)) + .filter(t => t.objectMatches(RDF.Property.asNode())) + .map(t => Triple.create(t.getSubject, RDFS.subPropertyOf.asNode(), t.getSubject)) .name("rdfs6") // rdfs8: (s rdf:type rdfs:Class ) => (s rdfs:subClassOf rdfs:Resource) // rdfs10: (s rdf:type rdfs:Class) => (s rdfs:subClassOf s) val rdfs8_10 = typeTriples - .filter(t => t.o == RDFS.Class.getURI) + .filter(t => t.objectMatches(RDFS.Class.asNode())) .flatMap( t => Set( - RDFTriple(t.s, RDFS.subClassOf.getURI, RDFS.Resource.getURI), - RDFTriple(t.s, RDFS.subClassOf.getURI, t.s) + Triple.create(t.getSubject, RDFS.subClassOf.asNode(), RDFS.Resource.asNode()), + Triple.create(t.getSubject, RDFS.subClassOf.asNode(), t.getSubject) ) ) .name("rdfs8/rdfs10") - val additionalTripleRDDs = mutable.Seq(rdfs4, rdfs6, rdfs8_10) + val additionalTripleRDDs = mutable.Seq(rdfs4, rdfs6, rdfs8_10, rdfs12) allTriples = env.union(Seq(allTriples) ++ additionalTripleRDDs).distinct() } @@ -332,43 +332,43 @@ class ForwardRuleReasonerRDFS(env: ExecutionEnvironment) extends ForwardRuleReas RDFGraph(allTriples) } - object SchemaTriplesFilter extends ((RDFTriple) => Boolean) with Serializable { + object SchemaTriplesFilter extends (Triple => Boolean) with Serializable { - val schemaPredicates = - Set(RDFS.subClassOf.getURI, RDFS.subPropertyOf.getURI, RDFS.domain.getURI, RDFS.range.getURI) + private val schemaPredicates = + Set(RDFS.subClassOf, RDFS.subPropertyOf, RDFS.domain, RDFS.range).map(_.asNode()) - override
def apply(t: RDFTriple): Boolean = schemaPredicates.contains(t.p) + override def apply(t: Triple): Boolean = schemaPredicates.contains(t.getPredicate) } - private def extractSchemaTriples(triples: DataSet[RDFTriple]): DataSet[RDFTriple] = { + private def extractSchemaTriples(triples: DataSet[Triple]): DataSet[Triple] = { triples.filter(SchemaTriplesFilter).name("schemaTriples") } - class SubClassOfFilterFunction(predicate: String) extends RichFilterFunction[RDFTriple]() { + class SubClassOfFilterFunction(predicate: String) extends RichFilterFunction[Triple]() { - var broadcastSet: Traversable[RDFTriple] = _ + var broadcastSet: Traversable[Triple] = _ override def open(config: Configuration): Unit = { // Access the broadcasted DataSet as a Collection - broadcastSet = getRuntimeContext().getBroadcastVariable[RDFTriple](predicate).asScala + broadcastSet = getRuntimeContext.getBroadcastVariable[Triple](predicate).asScala } - override def filter(t: RDFTriple): Boolean = - broadcastSet.exists(_.s == t.o) + override def filter(t: Triple): Boolean = + broadcastSet.exists(_.subjectMatches(t.getObject)) } - class SubClassOfFlatMapFunction(predicate: String) extends RichFlatMapFunction[RDFTriple, RDFTriple]() { - var broadcastSet: Traversable[RDFTriple] = _ + class SubClassOfFlatMapFunction(predicate: String) extends RichFlatMapFunction[Triple, Triple]() { + var broadcastSet: Traversable[Triple] = _ override def open(config: Configuration): Unit = { // Access the broadcasted DataSet as a Collection - broadcastSet = getRuntimeContext().getBroadcastVariable[RDFTriple](predicate).asScala + broadcastSet = getRuntimeContext.getBroadcastVariable[Triple](predicate).asScala } - override def flatMap(in: RDFTriple, collector: Collector[RDFTriple]): Unit = { + override def flatMap(in: Triple, collector: Collector[Triple]): Unit = { broadcastSet - .filter(t => in.o == t.s) - .foreach(t => collector.collect(RDFTriple(in.s, in.p, t.o))) + .filter(t => in.objectMatches(t.getSubject)) + 
.foreach(t => collector.collect(Triple.create(in.getSubject, in.getPredicate, t.getObject))) } } diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/TransitiveReasoner.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/TransitiveReasoner.scala index c2f2765..8fda1fd 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/TransitiveReasoner.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/TransitiveReasoner.scala @@ -5,8 +5,8 @@ import scala.reflect.ClassTag import org.apache.flink.api.common.typeinfo.TypeInformation import org.apache.flink.api.scala.{DataSet, _} import org.apache.flink.util.Collector +import org.apache.jena.graph.{Node, Triple} -import net.sansa_stack.inference.data.RDFTriple import net.sansa_stack.inference.utils.Profiler /** @@ -29,7 +29,7 @@ trait TransitiveReasoner extends Profiler{ * @param triples the set of triples * @return a set containing the transitive closure of the triples */ - def computeTransitiveClosure(triples: Set[RDFTriple]): Set[RDFTriple] = { + def computeTransitiveClosure(triples: Set[Triple]): Set[Triple] = { val tc = addTransitive(triples) // recursive call if set changed, otherwise stop and return if (tc.size == triples.size) triples else computeTransitiveClosure(tc) @@ -39,8 +39,9 @@ trait TransitiveReasoner extends Profiler{ // s ++ (for ((s1, p1, o1) <- s; (s2, p2, o2) <- s if o1 == s2) yield (s1, p1, o2)) // } - def addTransitive(triples: Set[RDFTriple]): Set[RDFTriple] = { - triples ++ (for (t1 <- triples; t2 <- triples if t1.o == t2.s) yield RDFTriple(t1.s, t1.p, t2.o)) + def addTransitive(triples: Set[Triple]): Set[Triple] = { + triples ++ (for (t1 <- triples; t2 <- triples if t1.objectMatches(t2.getSubject)) + yield Triple.create(t1.getSubject, t1.getPredicate, t2.getObject)) } /** @@ -50,16 +51,16 @@ trait TransitiveReasoner extends 
Profiler{ * @param triples the DataSet of triples * @return a DataSet containing the transitive closure of the triples */ - def computeTransitiveClosure(triples: DataSet[RDFTriple]): DataSet[RDFTriple] = { + def computeTransitiveClosure(triples: DataSet[Triple]): DataSet[Triple] = { if (triples.count() == 0) return triples log.info("computing TC...") profile { // keep the predicate - val predicate = triples.first(1).collect().head.p + val predicate = triples.first(1).collect().head.getPredicate // compute the TC - var subjectObjectPairs = triples.map(t => (t.s, t.o)) + var subjectObjectPairs = triples.map(t => (t.getSubject, t.getObject)) // because join() joins on keys, in addition the pairs are stored in reversed order (o, s) val objectSubjectPairs = subjectObjectPairs.map(t => (t._2, t._1)) @@ -86,7 +87,7 @@ trait TransitiveReasoner extends Profiler{ } while (nextCount != oldCount) log.info("TC has " + nextCount + " triples.") - subjectObjectPairs.map(p => RDFTriple(p._1, predicate, p._2)) + subjectObjectPairs.map(p => Triple.create(p._1, predicate, p._2)) } } @@ -99,20 +100,20 @@ trait TransitiveReasoner extends Profiler{ * @param triples the DataSet of triples * @return a DataSet containing the transitive closure of the triples */ - def computeTransitiveClosureOpt(triples: DataSet[RDFTriple]): DataSet[RDFTriple] = { + def computeTransitiveClosureOpt(triples: DataSet[Triple]): DataSet[Triple] = { if (triples.count() == 0) return triples log.info("computing TC...") profile { // keep the predicate - val predicate = triples.first(1).collect().head.p + val predicate = triples.first(1).collect().head.getPredicate // convert to tuples needed for the JOIN operator - val subjectObjectPairs = triples.map(t => (t.s, t.o)) + val subjectObjectPairs = triples.map(t => (t.getSubject, t.getObject)) // compute the TC val res = subjectObjectPairs.iterateWithTermination(10) { - prevPaths: DataSet[(String, String)] => + prevPaths: DataSet[(Node, Node)] => val nextPaths = 
prevPaths .join(subjectObjectPairs).where(1).equalTo(0) { @@ -125,7 +126,7 @@ trait TransitiveReasoner extends Profiler{ val terminate = prevPaths .coGroup(nextPaths) .where(0).equalTo(0) { - (prev, next, out: Collector[(String, String)]) => { + (prev, next, out: Collector[(Node, Node)]) => { val prevPaths = prev.toSet for (n <- next) if (!prevPaths.contains(n)) out.collect(n) @@ -135,7 +136,7 @@ trait TransitiveReasoner extends Profiler{ } // map back to RDF triples - res.map(p => RDFTriple(p._1, predicate, p._2)) + res.map(p => Triple.create(p._1, predicate, p._2)) } } @@ -184,15 +185,15 @@ trait TransitiveReasoner extends Profiler{ * @param triples the DataSet of triples * @return a DataSet containing the transitive closure of the triples */ - def computeTransitiveClosureOptSemiNaive(triples: DataSet[RDFTriple]): DataSet[RDFTriple] = { + def computeTransitiveClosureOptSemiNaive(triples: DataSet[Triple]): DataSet[Triple] = { log.info("computing TC...") - def iterate(s: DataSet[RDFTriple], ws: DataSet[RDFTriple]): (DataSet[RDFTriple], DataSet[RDFTriple]) = { + def iterate(s: DataSet[Triple], ws: DataSet[Triple]): (DataSet[Triple], DataSet[Triple]) = { val resolvedRedirects = triples.join(ws) - .where { _.s } - .equalTo { _.o } + .where { _.getSubject } + .equalTo { _.getObject } .map { joinResult => joinResult match { case (redirect, link) => - RDFTriple(link.s, redirect.p, redirect.o) + Triple.create(link.getSubject, redirect.getPredicate, redirect.getObject) } }.name("TC-From-Iteration") (resolvedRedirects, resolvedRedirects) From 69624cf52c984eb6561a77c23e3ae8ae250d64b3 Mon Sep 17 00:00:00 2001 From: Lorenz Buehmann Date: Wed, 27 Jun 2018 12:55:51 +0200 Subject: [PATCH 05/23] Cont. 
Flink RDF --- pom.xml | 6 ++ sansa-inference-flink/pom.xml | 6 ++ .../flink/RDFGraphMaterializer.scala | 7 +-- .../inference/flink/data/RDFGraph.scala | 4 +- .../inference/flink/data/RDFGraphLoader.scala | 61 +++++++++++-------- .../inference/flink/data/RDFGraphWriter.scala | 15 ++--- .../inference/flink/RDFGraphTestCase.scala | 39 ++++++------ .../conformance/OWLHorstConformanceTest.scala | 19 +++--- .../conformance/RDFSConformanceTest.scala | 21 ++++--- .../flink/rules/TransitivityRuleTest.scala | 23 ++++--- 10 files changed, 111 insertions(+), 90 deletions(-) diff --git a/pom.xml b/pom.xml index aab5acb..824da62 100644 --- a/pom.xml +++ b/pom.xml @@ -110,6 +110,12 @@ sansa-rdf-partition-core ${sansa.rdf.version} + + net.sansa-stack + sansa-rdf-flink_${scala.binary.version} + ${sansa.rdf.version} + + net.sansa-stack diff --git a/sansa-inference-flink/pom.xml b/sansa-inference-flink/pom.xml index c037344..ab4613a 100644 --- a/sansa-inference-flink/pom.xml +++ b/sansa-inference-flink/pom.xml @@ -47,6 +47,12 @@ under the License. 
test + + + net.sansa-stack + sansa-rdf-flink_${scala.binary.version} + + org.apache.flink diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/RDFGraphMaterializer.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/RDFGraphMaterializer.scala index c265dfd..43797a1 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/RDFGraphMaterializer.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/RDFGraphMaterializer.scala @@ -74,7 +74,6 @@ object RDFGraphMaterializer { // load triples from disk val graph = RDFGraphLoader.loadFromDisk(input, env) -// println(s"|G| = ${graph.size()}") // create reasoner val reasoner = profile match { @@ -90,7 +89,7 @@ object RDFGraphMaterializer { // compute inferred graph val inferredGraph = reasoner.apply(graph) - println(s"|G_inf| = ${inferredGraph.size()}") + println(s"|G_inf| = ${inferredGraph.size}") // write triples to disk // RDFGraphWriter.writeToDisk(inferredGraph, output, writeToSingleFile, sortedOutput) @@ -119,7 +118,7 @@ object RDFGraphMaterializer { // the CLI parser val parser = new scopt.OptionParser[Config]("RDFGraphMaterializer") { - head("RDFGraphMaterializer", "0.1.0") + head("RDFGraphMaterializer", "0.4.0") // opt[Seq[File]]('i', "input").required().valueName(",,..."). // action((x, c) => c.copy(in = x)). 
@@ -128,7 +127,7 @@ object RDFGraphMaterializer { .required() .valueName("") .action((x, c) => c.copy(in = x)) - .text("path to file or directory that contains the input files (in N-Triple format)") + .text("path to file or directory that contains the input files (in N-Triples format)") opt[URI]('o', "out") .required() diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraph.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraph.scala index b8c388d..2ae0099 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraph.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraph.scala @@ -67,7 +67,5 @@ case class RDFGraph(triples: DataSet[Triple]) { * * @return the number of triples */ - def size(): Long = { - triples.count() - } + lazy val size: Long = triples.count() } diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraphLoader.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraphLoader.scala index e631d4a..3349929 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraphLoader.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraphLoader.scala @@ -1,17 +1,15 @@ package net.sansa_stack.inference.flink.data -import java.io.File import java.net.URI -import org.apache.flink.api.scala.{ExecutionEnvironment, _} - -import net.sansa_stack.inference.data.RDFTriple -import org.apache.flink.configuration.Configuration +import scala.collection.JavaConverters._ import scala.language.implicitConversions -import org.apache.jena.rdf.model.impl.NTripleReader +import org.apache.flink.api.scala.{ExecutionEnvironment, _} +import org.apache.jena.riot.{Lang, RDFDataMgr} + +import net.sansa_stack.rdf.benchmark.io.ReadableByteChannelFromIterator -import 
net.sansa_stack.inference.utils.NTriplesStringToRDFTriple /** * @author Lorenz Buehmann @@ -20,41 +18,50 @@ object RDFGraphLoader { implicit def pathURIsConverter(uris: Seq[URI]): String = uris.map(p => p.toString).mkString(",") - def loadFromFile(path: String, env: ExecutionEnvironment): RDFGraph = { - val triples = env.readTextFile(path) - .map(line => line.replace(">", "").replace("<", "").split("\\s+")) // line to tokens - .map(tokens => RDFTriple(tokens(0), tokens(1), tokens(2))) // tokens to triple - RDFGraph(triples) + def loadFromDisk(path: String, env: ExecutionEnvironment): RDFGraph = { + loadFromDisk(URI.create(path), env) } def loadFromDisk(path: URI, env: ExecutionEnvironment): RDFGraph = { - // create a configuration object - val parameters = new Configuration + loadFromDisk(Seq(path), env) + } + + def loadFromDisk(paths: Seq[URI], env: ExecutionEnvironment): RDFGraph = { +// // create a configuration object +// val parameters = new Configuration +// +// // set the recursive enumeration parameter +// parameters.setBoolean("recursive.file.enumeration", true) +// env.readTextFile(f).withParameters(parameters) - // set the recursive enumeration parameter - parameters.setBoolean("recursive.file.enumeration", true) + val tmp: List[String] = paths.map(path => path.toString).toList + val triples = tmp + .map(f => env.readTextFile(f)) // no support to read from multiple paths at once, thus, map + union here + .reduce(_ union _) // TODO Flink 1.5.0 supports multiple paths via FileInputFormat + .mapPartition(p => { + // convert iterator to input stream + val is = ReadableByteChannelFromIterator.toInputStream(p.asJava) - // pass the configuration to the data source - val triples = env.readTextFile(path.toString).withParameters(parameters) - .map(line => line.replace(">", "").replace("<", "").split("\\s+")) // line to tokens - .map(tokens => RDFTriple(tokens(0), tokens(1), tokens(2))) - .name("triples") // tokens to triple + RDFDataMgr.createIteratorTriples(is, 
Lang.NTRIPLES, null).asScala + }) + .name("triples") RDFGraph(triples) } - def loadFromDisk(paths: Seq[URI], env: ExecutionEnvironment): RDFGraph = { + def main(args: Array[String]): Unit = { + if (args.length == 0) println("Usage: RDFGraphLoader ") - val tmp: List[String] = paths.map(path => path.toString).toList + val path = args(0) - val converter = new NTriplesStringToRDFTriple() + val env = ExecutionEnvironment.getExecutionEnvironment - val triples = tmp - .map(f => env.readTextFile(f).flatMap(line => converter.apply(line))).reduce(_ union _).name("triples") + val ds = RDFGraphLoader.loadFromDisk(path, env).triples - RDFGraph(triples) + println(s"size:${ds.count}") + println("sample data:\n" + ds.first(10).map { _.toString.replaceAll("[\\x00-\\x1f]","???")}.collect().mkString("\n")) } } diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraphWriter.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraphWriter.scala index f421f72..ed085b8 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraphWriter.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraphWriter.scala @@ -1,6 +1,6 @@ package net.sansa_stack.inference.flink.data -import java.io.{ByteArrayInputStream, File} +import java.io.ByteArrayInputStream import java.net.URI import java.nio.charset.StandardCharsets @@ -8,10 +8,11 @@ import org.apache.flink.api.common.operators.Order import org.apache.flink.api.scala._ import org.apache.flink.core.fs.FileSystem import org.apache.jena.rdf.model.{Model, ModelFactory} - -import net.sansa_stack.inference.utils.{RDFTripleOrdering, RDFTripleToNTripleString} +import org.apache.jena.sparql.util.TripleComparator import org.slf4j.LoggerFactory +import net.sansa_stack.inference.utils.{JenaTripleToNTripleString, RDFTripleOrdering} + /** * Writes an RDF graph to disk. 
* @@ -26,10 +27,10 @@ object RDFGraphWriter { logger.info("writing triples to disk...") val startTime = System.currentTimeMillis() - implicit val ordering = RDFTripleOrdering + implicit val ordering = new TripleComparator() graph.triples.map(t => (t, t)).sortPartition(1, Order.DESCENDING).map(_._1) - .map(new RDFTripleToNTripleString()) // to N-TRIPLES string + .map(new JenaTripleToNTripleString()) // to N-Triples string .writeAsText(path, writeMode = FileSystem.WriteMode.OVERWRITE) logger.info("finished writing triples to disk in " + (System.currentTimeMillis()-startTime) + "ms.") @@ -61,14 +62,14 @@ object RDFGraphWriter { } tmp - .map(new RDFTripleToNTripleString()) // to N-TRIPLES string + .map(new JenaTripleToNTripleString()) // to N-TRIPLES string .writeAsText(path.toString, writeMode = FileSystem.WriteMode.OVERWRITE) logger.info("finished writing triples to disk in " + (System.currentTimeMillis()-startTime) + "ms.") } def convertToModel(graph: RDFGraph) : Model = { - val modelString = graph.triples.map(new RDFTripleToNTripleString()) + val modelString = graph.triples.map(new JenaTripleToNTripleString()) .collect().mkString("\n") val model = ModelFactory.createDefaultModel() diff --git a/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/RDFGraphTestCase.scala b/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/RDFGraphTestCase.scala index 04e2f5b..e111e6d 100644 --- a/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/RDFGraphTestCase.scala +++ b/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/RDFGraphTestCase.scala @@ -3,17 +3,20 @@ package net.sansa_stack.inference.flink import java.util import java.util.Comparator +import scala.collection.JavaConverters._ + import com.google.common.collect.ComparisonChain -import net.sansa_stack.inference.flink.data.RDFGraph import org.apache.flink.api.scala.{ExecutionEnvironment, _} import 
org.apache.flink.test.util.MultipleProgramsTestBase.TestExecutionMode import org.apache.flink.test.util.{MultipleProgramsTestBase, TestBaseUtils} +import org.apache.jena.graph.{NodeFactory, Triple} +import org.apache.jena.sparql.util.TripleComparator import org.junit.Test import org.junit.runner.RunWith import org.junit.runners.Parameterized -import net.sansa_stack.inference.data.RDFTriple -import scala.collection.JavaConverters._ +import net.sansa_stack.inference.data.RDFTriple +import net.sansa_stack.inference.flink.data.RDFGraph /** * A test case for the computation of the transitive closure (TC). @@ -26,19 +29,24 @@ class RDFGraphTestCase(mode: TestExecutionMode) extends MultipleProgramsTestBase def testSubtract(): Unit = { val env = ExecutionEnvironment.getExecutionEnvironment + val s1 = NodeFactory.createURI("s1") + val p1 = NodeFactory.createURI("p1") + val o1 = NodeFactory.createURI("o1") + val o2 = NodeFactory.createURI("o2") + val o3 = NodeFactory.createURI("o3") // generate dataset val g1 = RDFGraph(env.fromCollection( Seq( - RDFTriple("s1", "p1", "o1"), - RDFTriple("s1", "p1", "o2"), - RDFTriple("s1", "p1", "o3") + Triple.create(s1, p1, o1), + Triple.create(s1, p1, o2), + Triple.create(s1, p1, o3) ) )) val g2 = RDFGraph(env.fromCollection( Seq( - RDFTriple("s1", "p1", "o1"), - RDFTriple("s1", "p1", "o2") + Triple.create(s1, p1, o1), + Triple.create(s1, p1, o2) ) )) @@ -47,17 +55,12 @@ class RDFGraphTestCase(mode: TestExecutionMode) extends MultipleProgramsTestBase val result = g_diff.triples.collect() val expected = Seq( - RDFTriple("s1", "p1", "o3") + Triple.create(s1, p1, o3) ) - TestBaseUtils.compareResultCollections(new util.ArrayList(result.asJava), new util.ArrayList(expected.asJava), new Comparator[RDFTriple] { - override def compare(t1: RDFTriple, t2: RDFTriple): Int = - ComparisonChain.start() - .compare(t1.s, t2.s) - .compare(t1.p, t2.p) - .compare(t1.o, t2.o) - .result() - }) + TestBaseUtils.compareResultCollections( + new 
util.ArrayList(result.asJava), + new util.ArrayList(expected.asJava), + new TripleComparator()) } - } diff --git a/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/OWLHorstConformanceTest.scala b/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/OWLHorstConformanceTest.scala index fb7987b..81570ce 100644 --- a/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/OWLHorstConformanceTest.scala +++ b/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/OWLHorstConformanceTest.scala @@ -1,14 +1,14 @@ package net.sansa_stack.inference.flink.conformance -import net.sansa_stack.inference.flink.data.RDFGraphWriter -import net.sansa_stack.test.conformance.{IntegrationTestSuite, OWLHorstConformanceTestBase} +import scala.collection.mutable + import org.apache.flink.api.scala._ +import org.apache.jena.graph.Triple import org.apache.jena.rdf.model.Model -import net.sansa_stack.inference.data.{RDFTriple, SimpleRDFOps} -import net.sansa_stack.inference.flink.data.{RDFGraph, RDFGraphWriter} -import org.scalatest.Ignore -import scala.collection.mutable +import net.sansa_stack.inference.data.{Jena, JenaOps} +import net.sansa_stack.inference.flink.data.{RDFGraph, RDFGraphWriter} +import net.sansa_stack.test.conformance.OWLHorstConformanceTestBase /** * The class is to test the conformance of each materialization rule of OWL Horst entailment. 
@@ -16,10 +16,11 @@ import scala.collection.mutable * @author Lorenz Buehmann * */ -@IntegrationTestSuite -class OWLHorstConformanceTest extends OWLHorstConformanceTestBase(rdfOps = new SimpleRDFOps) with SharedOWLHorstReasonerContext{ +class OWLHorstConformanceTest + extends OWLHorstConformanceTestBase[Jena](rdfOps = new JenaOps) + with SharedOWLHorstReasonerContext{ - override def computeInferredModel(triples: mutable.HashSet[RDFTriple]): Model = { + override def computeInferredModel(triples: mutable.HashSet[Triple]): Model = { // distribute triples val triplesRDD = env.fromCollection(triples) diff --git a/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/RDFSConformanceTest.scala b/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/RDFSConformanceTest.scala index cfa371d..fd7b3dc 100644 --- a/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/RDFSConformanceTest.scala +++ b/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/RDFSConformanceTest.scala @@ -1,14 +1,14 @@ package net.sansa_stack.inference.flink.conformance -import net.sansa_stack.inference.flink.data.RDFGraphWriter -import net.sansa_stack.test.conformance.{IntegrationTestSuite, RDFSConformanceTestBase} -import org.apache.jena.rdf.model.Model -import net.sansa_stack.inference.data.{RDFTriple, SimpleRDFOps} -import net.sansa_stack.inference.flink.data.RDFGraph +import scala.collection.mutable + import org.apache.flink.api.scala._ -import org.scalatest.Ignore +import org.apache.jena.graph.Triple +import org.apache.jena.rdf.model.Model -import scala.collection.mutable +import net.sansa_stack.inference.data.{Jena, JenaOps} +import net.sansa_stack.inference.flink.data.{RDFGraph, RDFGraphWriter} +import net.sansa_stack.test.conformance.RDFSConformanceTestBase /** * The class is to test the conformance of each materialization rule of RDFS(simple) entailment. 
@@ -16,10 +16,11 @@ import scala.collection.mutable * @author Lorenz Buehmann * */ -@IntegrationTestSuite -class RDFSConformanceTest extends RDFSConformanceTestBase(rdfOps = new SimpleRDFOps) with SharedRDFSReasonerContext{ +class RDFSConformanceTest + extends RDFSConformanceTestBase[Jena](rdfOps = new JenaOps) + with SharedRDFSReasonerContext{ - override def computeInferredModel(triples: mutable.HashSet[RDFTriple]): Model = { + override def computeInferredModel(triples: mutable.HashSet[Triple]): Model = { // distribute triples val triplesRDD = env.fromCollection(triples) diff --git a/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/rules/TransitivityRuleTest.scala b/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/rules/TransitivityRuleTest.scala index 6e1940c..873d10f 100644 --- a/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/rules/TransitivityRuleTest.scala +++ b/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/rules/TransitivityRuleTest.scala @@ -1,13 +1,13 @@ package net.sansa_stack.inference.flink.rules -import net.sansa_stack.inference.flink.data.RDFGraphWriter -import net.sansa_stack.inference.flink.forwardchaining.ForwardRuleReasonerRDFS +import scala.collection.mutable + import org.apache.flink.api.scala.{ExecutionEnvironment, _} +import org.apache.jena.graph.{NodeFactory, Triple} import org.apache.jena.vocabulary.RDFS -import net.sansa_stack.inference.data.RDFTriple -import net.sansa_stack.inference.flink.data.{RDFGraph, RDFGraphWriter} -import scala.collection.mutable +import net.sansa_stack.inference.flink.data.{RDFGraph, RDFGraphWriter} +import net.sansa_stack.inference.flink.forwardchaining.ForwardRuleReasonerRDFS /** * A forward chaining implementation of the RDFS entailment regime. 
@@ -22,25 +22,24 @@ object TransitivityRuleTest { env.setParallelism(4) // generate graph - val triples = new mutable.HashSet[RDFTriple]() - val ns = "http://ex.org/" - val p1 = RDFS.subClassOf.getURI + val triples = new mutable.HashSet[Triple]() + val p1 = RDFS.subClassOf.asNode() val scale = 1 val begin = 1 val end = 10 * scale for(i <- begin to end) { - triples += RDFTriple(ns + "x" + i, p1, ns + "y" + i) - triples += RDFTriple(ns + "y" + i, p1, ns + "z" + i) - triples += RDFTriple(ns + "z" + i, p1, ns + "w" + i) + triples += Triple.create(NodeFactory.createURI("x" + i), p1, NodeFactory.createURI("y" + i)) + triples += Triple.create(NodeFactory.createURI("y" + i), p1, NodeFactory.createURI("z" + i)) + triples += Triple.create(NodeFactory.createURI("z" + i), p1, NodeFactory.createURI("w" + i)) } // graph is a path of length n // (x1, p, x2), (x2, p, x3), ..., (x(n-1), p, xn) val n = 10 for (i <- 1 to end) { - triples += RDFTriple(ns + "x" + i, p1, ns + "x" + (i + 1)) + triples += Triple.create(NodeFactory.createURI("x" + i), p1, NodeFactory.createURI("x" + (i + 1))) } val triplesDataset = env.fromCollection(triples) From 8719a31bce967da029fce85735a7900650767374 Mon Sep 17 00:00:00 2001 From: Lorenz Buehmann Date: Sun, 16 Jun 2019 08:49:49 +0200 Subject: [PATCH 06/23] Use RDF layer for loading/writing --- .../inference/flink/data/RDFGraphLoader.scala | 30 +++---------------- 1 file changed, 4 insertions(+), 26 deletions(-) diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraphLoader.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraphLoader.scala index 3349929..131f443 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraphLoader.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraphLoader.scala @@ -2,13 +2,10 @@ package net.sansa_stack.inference.flink.data import java.net.URI -import 
scala.collection.JavaConverters._ import scala.language.implicitConversions +import net.sansa_stack.rdf.flink.io.ntriples.NTriplesReader import org.apache.flink.api.scala.{ExecutionEnvironment, _} -import org.apache.jena.riot.{Lang, RDFDataMgr} - -import net.sansa_stack.rdf.benchmark.io.ReadableByteChannelFromIterator /** @@ -28,27 +25,7 @@ object RDFGraphLoader { } def loadFromDisk(paths: Seq[URI], env: ExecutionEnvironment): RDFGraph = { -// // create a configuration object -// val parameters = new Configuration -// -// // set the recursive enumeration parameter -// parameters.setBoolean("recursive.file.enumeration", true) -// env.readTextFile(f).withParameters(parameters) - - val tmp: List[String] = paths.map(path => path.toString).toList - - val triples = tmp - .map(f => env.readTextFile(f)) // no support to read from multiple paths at once, thus, map + union here - .reduce(_ union _) // TODO Flink 1.5.0 supports multiple paths via FileInputFormat - .mapPartition(p => { - // convert iterator to input stream - val is = ReadableByteChannelFromIterator.toInputStream(p.asJava) - - RDFDataMgr.createIteratorTriples(is, Lang.NTRIPLES, null).asScala - }) - .name("triples") - - RDFGraph(triples) + RDFGraph(NTriplesReader.load(env, paths)) } def main(args: Array[String]): Unit = { @@ -56,7 +33,8 @@ object RDFGraphLoader { val path = args(0) - val env = ExecutionEnvironment.getExecutionEnvironment +// val env = ExecutionEnvironment.getExecutionEnvironment + val env = ExecutionEnvironment.createLocalEnvironment(parallelism = 2) val ds = RDFGraphLoader.loadFromDisk(path, env).triples From 4d99e4f8bb94dc1b4d1e474a2938885e6f403e7e Mon Sep 17 00:00:00 2001 From: Lorenz Buehmann Date: Mon, 17 Jun 2019 11:33:31 +0200 Subject: [PATCH 07/23] Key type wrapper. 
--- .../inference/flink/utils/NodeKey.scala | 35 ++++++++++ .../inference/flink/utils/key/Key.scala | 18 +++++ .../inference/flink/utils/key/Key1.scala | 36 ++++++++++ .../inference/flink/utils/key/Key2.scala | 53 ++++++++++++++ .../inference/flink/utils/key/Key3.scala | 69 +++++++++++++++++++ 5 files changed, 211 insertions(+) create mode 100644 sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/utils/NodeKey.scala create mode 100644 sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/utils/key/Key.scala create mode 100644 sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/utils/key/Key1.scala create mode 100644 sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/utils/key/Key2.scala create mode 100644 sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/utils/key/Key3.scala diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/utils/NodeKey.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/utils/NodeKey.scala new file mode 100644 index 0000000..3aadb99 --- /dev/null +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/utils/NodeKey.scala @@ -0,0 +1,35 @@ +package net.sansa_stack.inference.flink.utils + +import org.apache.jena.graph.Node +import org.apache.jena.sparql.util.NodeComparator + +/** + * Key type wrapper for Jena `Node` objects. + * It basically makes Node comparable which is necessary to be handles as key in Flink. 
+ * + * @author Lorenz Buehmann + */ +class NodeKey(val node: Node) extends Comparable[NodeKey] with Equals { + + override def compareTo(o: NodeKey): Int = { + val other = o.node + if (node == null) + if (other == null) 0 else -1 + else + if (other == null) 1 else new NodeComparator().compare(node, other) + } + + override def canEqual(that: Any): Boolean = that.isInstanceOf[NodeKey] + + override def hashCode(): Int = 31 * node.## + + override def equals(that: Any): Boolean = + that match { + case key: NodeKey => (this eq key) || (key.canEqual(this) && hashCode == key.hashCode) + case _ => false + } +} + +object NodeKey { + def apply(node: Node): NodeKey = new NodeKey(node) +} diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/utils/key/Key.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/utils/key/Key.scala new file mode 100644 index 0000000..aeada12 --- /dev/null +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/utils/key/Key.scala @@ -0,0 +1,18 @@ +package net.sansa_stack.inference.flink.utils.key + +/** + * Base of a tuple-like generic key. + * + * @tparam T The type of the concrete key type. + */ +abstract class Key[T <: Key[T]] extends Comparable[T] { + /** + * Gets the i-th element of the tuple-like key. + * + * @param pos The position. + * @return The element at that key position; + */ + def get(pos: Int): Any +} + + diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/utils/key/Key1.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/utils/key/Key1.scala new file mode 100644 index 0000000..fcae711 --- /dev/null +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/utils/key/Key1.scala @@ -0,0 +1,36 @@ +package net.sansa_stack.inference.flink.utils.key + +/** + * A key with one key field. + * + * @tparam T1 The type of the field. 
+ */ +class Key1[T1 <: Comparable[T1]](val value1: T1) extends Key[Key1[T1]] with Equals { + + def get(pos: Int): Any = pos match { + case 0 => + value1 + case _ => + throw new IndexOutOfBoundsException + } + + override def hashCode: Int = if (value1 == null) 0 else value1.hashCode + + override def canEqual(that: Any): Boolean = that.isInstanceOf[Key1[T1]] + + override def equals(obj: Any): Boolean = + obj match { + case that: Key1[T1] => (this eq that) || (this.canEqual(that) && (value1 == that.value1)) + case _ => false + } + + override def toString: String = s"Key1 ($value1)" + + def compareTo(o: Key1[T1]): Int = { + val other = o.value1 + if (value1 == null) + if (other == null) 0 else -1 + else + if (other == null) 1 else value1.compareTo(other) + } +} diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/utils/key/Key2.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/utils/key/Key2.scala new file mode 100644 index 0000000..b8d808e --- /dev/null +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/utils/key/Key2.scala @@ -0,0 +1,53 @@ +package net.sansa_stack.inference.flink.utils.key + +/** + * A key with two key fields. + * + * @tparam T1 The type of the first field. + * @tparam T2 The type of the second field. 
+ */ +class Key2[T1 <: Comparable[T1], T2 <: Comparable[T2]](val value1: T1, val value2: T2) + extends Key[Key2[T1, T2]] + with Equals { + + def get(pos: Int): Any = pos match { + case 0 => + value1 + case 1 => + value2 + case _ => + throw new IndexOutOfBoundsException + } + + override def hashCode: Int = { + val c1: Int = if (value1 == null) 0 else value1.hashCode + val c2: Int = if (value2 == null) 0 else value2.hashCode + c1 * 17 + c2 * 31 + } + + override def canEqual(that: Any): Boolean = that.isInstanceOf[Key2[T1, T2]] + + override def equals(obj: Any): Boolean = + obj match { + case that: Key2[T1, T2] => (this eq that) || (this.canEqual(that) && (value1 == that.value1) && (value2 == that.value2)) + case _ => false + } + + override def toString: String = s"Key2 ($value1, $value2)" + + def compareTo(o: Key2[T1, T2]): Int = { + val other1 = o.value1 + val other2 = o.value2 + + val c1 = if (value1 == null) + if (other1 == null) 0 else -1 + else + if (other1 == null) 1 else value1.compareTo(other1) + + if(c1 != 0) c1 else + if (value2 == null) + if (other2 == null) 0 else -1 + else + if (other2 == null) 1 else value2.compareTo(other2) + } +} diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/utils/key/Key3.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/utils/key/Key3.scala new file mode 100644 index 0000000..e875b26 --- /dev/null +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/utils/key/Key3.scala @@ -0,0 +1,69 @@ +package net.sansa_stack.inference.flink.utils.key + +/** + * A key with three key fields. + * + * @tparam T1 The type of the first field. + * @tparam T2 The type of the second field. + * @tparam T3 The type of the third field.
+ */ +class Key3[T1 <: Comparable[T1], T2 <: Comparable[T2], T3 <: Comparable[T3]](val value1: T1, val value2: T2, val value3: T3) + extends Key[Key3[T1, T2, T3]] + with Equals { + + def get(pos: Int): Any = pos match { + case 0 => + value1 + case 1 => + value2 + case 2 => + value3 + case _ => + throw new IndexOutOfBoundsException + } + + override def hashCode: Int = { + val c1: Int = if (value1 == null) 0 else value1.hashCode + val c2: Int = if (value2 == null) 0 else value2.hashCode + val c3: Int = if (value3 == null) 0 else value3.hashCode + c1 * 17 + c2 * 31 + c3 * 47 + } + + override def canEqual(that: Any): Boolean = that.isInstanceOf[Key3[T1, T2, T3]] + + override def equals(obj: Any): Boolean = + obj match { + case that: Key3[T1, T2, T3] => + (this eq that) || (this.canEqual(that) && (value1 == that.value1) && (value2 == that.value2) && (value3 == that.value3)) + case _ => false + } + + override def toString: String = s"Key3 ($value1, $value2, $value3)" + + def compareTo(o: Key3[T1, T2, T3]): Int = { + val other1 = o.value1 + val other2 = o.value2 + val other3 = o.value3 + + val c1 = if (value1 == null) + if (other1 == null) 0 else -1 + else + if (other1 == null) 1 else value1.compareTo(other1) + + if(c1 != 0) c1 else { + val c2 = if (value2 == null) + if (other2 == null) 0 else -1 + else + if (other2 == null) 1 else value2.compareTo(other2) + + if(c2 != 0) c2 else { + if (value3 == null) + if (other3 == null) 0 else -1 + else + if (other3 == null) 1 else value3.compareTo(other3) + } + + } + + } +} From fc072988366c3046f2f4f81ebdd1df6593b2125d Mon Sep 17 00:00:00 2001 From: Lorenz Buehmann Date: Mon, 17 Jun 2019 11:35:35 +0200 Subject: [PATCH 08/23] Flink needs either key type or key selector function for join() and distinct() operators. 
--- .../ForwardRuleReasonerRDFS.scala | 2 +- .../forwardchaining/TransitiveReasoner.scala | 59 +++++++++++++++---- 2 files changed, 50 insertions(+), 11 deletions(-) diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasonerRDFS.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasonerRDFS.scala index a1538b7..e17eeae 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasonerRDFS.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasonerRDFS.scala @@ -270,7 +270,7 @@ class ForwardRuleReasonerRDFS(env: ExecutionEnvironment) extends ForwardRuleReas .union( Seq(otherTriples, subClassOfTriplesTrans, subPropertyOfTriplesTrans, typeTriples, triplesRDFS7, triplesRDFS9) ) - .distinct() + .distinct(t => t.hashCode()) // we perform also additional rules if enabled if (level != SIMPLE) { diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/TransitiveReasoner.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/TransitiveReasoner.scala index 8fda1fd..23498ec 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/TransitiveReasoner.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/TransitiveReasoner.scala @@ -6,7 +6,9 @@ import org.apache.flink.api.common.typeinfo.TypeInformation import org.apache.flink.api.scala.{DataSet, _} import org.apache.flink.util.Collector import org.apache.jena.graph.{Node, Triple} +import org.apache.jena.sparql.util.NodeComparator +import net.sansa_stack.inference.flink.utils.NodeKey import net.sansa_stack.inference.utils.Profiler /** @@ -186,28 +188,65 @@ trait TransitiveReasoner extends Profiler{ * @return a DataSet containing the transitive closure 
of the triples */ def computeTransitiveClosureOptSemiNaive(triples: DataSet[Triple]): DataSet[Triple] = { + + // apparently, we have to use pairs for (subject, object) because the Jena Triple is not a Scala tuple + // and we have to provide positions of key and value in the iterate method + // the initial set of edges is used as input for both, the workset and the solutionset + val initialTC = triples.map(t => (NodeKey(t.getSubject), NodeKey(t.getObject))) + val pred = triples.first(1).collect().head.getPredicate + log.info("computing TC...") - def iterate(s: DataSet[Triple], ws: DataSet[Triple]): (DataSet[Triple], DataSet[Triple]) = { - val resolvedRedirects = triples.join(ws) - .where { _.getSubject } - .equalTo { _.getObject } + def iterate(s: DataSet[(NodeKey, NodeKey)], ws: DataSet[(NodeKey, NodeKey)]) + : (DataSet[(NodeKey, NodeKey)], DataSet[(NodeKey, NodeKey)]) = { + val resolvedRedirects = initialTC.join(ws) + .where(0) + .equalTo(1) .map { joinResult => joinResult match { - case (redirect, link) => - Triple.create(link.getSubject, redirect.getPredicate, redirect.getObject) + case (redirect, link) => (link._1, redirect._2) } }.name("TC-From-Iteration") (resolvedRedirects, resolvedRedirects) } - val tc = triples - .iterateDelta(triples, 10, Array("s", "o"))(iterate) + val tc = initialTC + .iterateDelta(initialTC, 10, Array(0))(iterate) .name("Final-TC") log.info("finished computing TC") // .map { cl => cl} // .name("Final-Redirect-Result") - tc + tc.map(t => Triple.create(t._1.node, pred, t._2.node)) } - +// /** +// * Computes the transitive closure on a DataSet of triples. +// * Note, that the assumption is that all triples do have the same predicate. 
+// * This implementation uses the Flink iterate operator (see +// * [[https://ci.apache.org/projects/flink/flink-docs-master/dev/batch/iterations.html"]]) +// * +// * @param triples the DataSet of triples +// * @return a DataSet containing the transitive closure of the triples +// */ +// def computeTransitiveClosureOptSemiNaive(triples: DataSet[Triple]): DataSet[Triple] = { +// log.info("computing TC...") +// def iterate(s: DataSet[Triple], ws: DataSet[Triple]): (DataSet[Triple], DataSet[Triple]) = { +// val resolvedRedirects = triples.join(ws) +// .where { _.getSubject } +// .equalTo { _.getObject } +// .map { joinResult => joinResult match { +// case (redirect, link) => +// Triple.create(link.getSubject, redirect.getPredicate, redirect.getObject) +// } +// }.name("TC-From-Iteration") +// (resolvedRedirects, resolvedRedirects) +// } +// +// val tc = triples +// .iterateDelta(triples, 10, Array("s", "o"))(iterate) +// .name("Final-TC") +// log.info("finished computing TC") +// // .map { cl => cl} +// // .name("Final-Redirect-Result") +// tc +// } } From be1bf6f9a646ed4ccc570a87b9049e900dae6797 Mon Sep 17 00:00:00 2001 From: Lorenz Buehmann Date: Mon, 17 Jun 2019 11:36:33 +0200 Subject: [PATCH 09/23] Minor changes in main entry class, e.g. always write to disk and bumped CLI version. 
--- .../inference/flink/RDFGraphMaterializer.scala | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/RDFGraphMaterializer.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/RDFGraphMaterializer.scala index 43797a1..da699eb 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/RDFGraphMaterializer.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/RDFGraphMaterializer.scala @@ -21,7 +21,8 @@ import net.sansa_stack.inference.rules.ReasoningProfile._ import net.sansa_stack.inference.rules.{RDFSLevel, ReasoningProfile} /** - * The class to compute the RDFS materialization of a given RDF graph. + * A class to compute the materialization of a given RDF graph for a given reasoning profile. + * Basically, used as the main class for inference. * * @author Lorenz Buehmann * @@ -66,6 +67,7 @@ object RDFGraphMaterializer { // set up the execution environment val env = ExecutionEnvironment.getExecutionEnvironment + // and disable logging to standard out env.getConfig.disableSysoutLogging() // env.setParallelism(4) @@ -89,10 +91,10 @@ object RDFGraphMaterializer { // compute inferred graph val inferredGraph = reasoner.apply(graph) - println(s"|G_inf| = ${inferredGraph.size}") +// println(s"|G_inf| = ${inferredGraph.size}") // write triples to disk -// RDFGraphWriter.writeToDisk(inferredGraph, output, writeToSingleFile, sortedOutput) + RDFGraphWriter.writeToDisk(inferredGraph, output, writeToSingleFile, sortedOutput) // println(env.getExecutionPlan()) @@ -118,7 +120,7 @@ object RDFGraphMaterializer { // the CLI parser val parser = new scopt.OptionParser[Config]("RDFGraphMaterializer") { - head("RDFGraphMaterializer", "0.4.0") + head("RDFGraphMaterializer", "0.5.0") // opt[Seq[File]]('i', "input").required().valueName(",,..."). // action((x, c) => c.copy(in = x)). 
From 095b5a05ea4432c455eb893f523b9364be5821be Mon Sep 17 00:00:00 2001 From: Lorenz Buehmann Date: Thu, 27 Jun 2019 10:52:08 +0200 Subject: [PATCH 10/23] boolean function utils. --- .../scala/net/sansa_stack/inference/utils/PredicateUtils.scala | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/PredicateUtils.scala diff --git a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/PredicateUtils.scala b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/PredicateUtils.scala new file mode 100644 index 0000000..e69de29 From 04effc960bc50310011fa5ad8aa1eeef117aad47 Mon Sep 17 00:00:00 2001 From: Lorenz Buehmann Date: Thu, 27 Jun 2019 10:52:54 +0200 Subject: [PATCH 11/23] POM cleanup --- pom.xml | 3 +-- sansa-inference-flink/pom.xml | 21 +++++++++++---------- sansa-inference-spark/pom.xml | 2 -- sansa-inference-tests/pom.xml | 2 +- 4 files changed, 13 insertions(+), 15 deletions(-) diff --git a/pom.xml b/pom.xml index 9453ca2..73ca809 100644 --- a/pom.xml +++ b/pom.xml @@ -1,6 +1,5 @@ - + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 4.0.0 diff --git a/sansa-inference-flink/pom.xml b/sansa-inference-flink/pom.xml index 4a06072..049d7de 100644 --- a/sansa-inference-flink/pom.xml +++ b/sansa-inference-flink/pom.xml @@ -35,12 +35,12 @@ under the License. net.sansa-stack sansa-inference-common_${scala.binary.version} - ${parent.version} + ${project.parent.version} net.sansa-stack sansa-inference-tests_${scala.binary.version} - ${parent.version} + ${project.parent.version} test-jar test @@ -49,6 +49,7 @@ under the License. net.sansa-stack sansa-rdf-flink_${scala.binary.version} + ${project.parent.version} @@ -56,14 +57,14 @@ under the License. 
org.apache.flink flink-scala_${scala.binary.version} - - org.apache.flink - flink-streaming-scala_${scala.binary.version} - - - org.apache.flink - flink-clients_${scala.binary.version} - + + + + + + + + diff --git a/sansa-inference-spark/pom.xml b/sansa-inference-spark/pom.xml index 1e1882a..a440cbc 100644 --- a/sansa-inference-spark/pom.xml +++ b/sansa-inference-spark/pom.xml @@ -14,8 +14,6 @@ compile - - 0.3.1-SNAPSHOT diff --git a/sansa-inference-tests/pom.xml b/sansa-inference-tests/pom.xml index 8cc02d9..05c4b34 100644 --- a/sansa-inference-tests/pom.xml +++ b/sansa-inference-tests/pom.xml @@ -15,7 +15,7 @@ net.sansa-stack sansa-inference-common_${scala.binary.version} - ${parent.version} + ${project.parent.version} From d5e6b6ce831a8ebfa497b85c0ff5c8cfabad8a22 Mon Sep 17 00:00:00 2001 From: Lorenz Buehmann Date: Thu, 27 Jun 2019 10:58:04 +0200 Subject: [PATCH 12/23] Utils simplified --- .../utils/JenaTripleToNTripleString.scala | 22 +++------- .../utils/NTriplesStringToRDFTriple.scala | 3 +- .../inference/utils/PredicateUtils.scala | 40 +++++++++++++++++++ 3 files changed, 46 insertions(+), 19 deletions(-) diff --git a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/JenaTripleToNTripleString.scala b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/JenaTripleToNTripleString.scala index b5dee64..b94a842 100644 --- a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/JenaTripleToNTripleString.scala +++ b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/JenaTripleToNTripleString.scala @@ -1,33 +1,21 @@ package net.sansa_stack.inference.utils import org.apache.jena.graph.Triple +import org.apache.jena.shared.PrefixMapping +import org.apache.jena.sparql.util.FmtUtils /** * Convert a Jena Triple to an N-Triples string. * + * @note it turns out, that it might be more efficient to use the Jena stream based writer API per partition. 
+ * * @author Lorenz Buehmann */ class JenaTripleToNTripleString extends Function[Triple, String] with java.io.Serializable { - override def apply(t: Triple): String = { - val subStr = - if (t.getSubject.isBlank) { - s"_:${t.getSubject.getBlankNodeLabel}" - } else { - s"<${t.getSubject.getURI}>" - } - val objStr = - if (t.getObject.isLiteral) { - t.getObject - } else if (t.getObject.isBlank) { - s"_:${t.getObject}" - } else { - s"<${t.getObject}>" - } - s"$subStr <${t.getPredicate}> $objStr ." - } + override def apply(t: Triple): String = s"${FmtUtils.stringForTriple(t, null.asInstanceOf[PrefixMapping])} ." } diff --git a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/NTriplesStringToRDFTriple.scala b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/NTriplesStringToRDFTriple.scala index 643014c..2c978e6 100644 --- a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/NTriplesStringToRDFTriple.scala +++ b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/NTriplesStringToRDFTriple.scala @@ -2,7 +2,6 @@ package net.sansa_stack.inference.utils import java.io.ByteArrayInputStream -import org.apache.jena.graph.Triple import org.apache.jena.riot.{Lang, RDFDataMgr} import net.sansa_stack.inference.data.RDFTriple @@ -13,7 +12,7 @@ import net.sansa_stack.inference.data.RDFTriple * @author Lorenz Buehmann */ class NTriplesStringToRDFTriple - extends Function1[String, Option[RDFTriple]] + extends ((String) => Option[RDFTriple]) with java.io.Serializable { override def apply(s: String): Option[RDFTriple] = { val t = RDFDataMgr.createIteratorTriples(new ByteArrayInputStream(s.getBytes), Lang.NTRIPLES, null).next() diff --git a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/PredicateUtils.scala b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/PredicateUtils.scala index e69de29..12e55bc 100644 --- 
a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/PredicateUtils.scala +++ b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/PredicateUtils.scala @@ -0,0 +1,40 @@ +package net.sansa_stack.inference.utils + +/** + * Some utils for logical combinations of boolean functions. + */ +object PredicateUtils { + + implicit class RichPredicate[A](f: A => Boolean) extends (A => Boolean) { + def apply(v: A): Boolean = f(v) + + /** + * Logical 'and'. + * + * @param g the predicate to combine with the wrapped predicate + * @return a predicate that holds iff both the wrapped predicate and `g` hold; `g` is only evaluated if the wrapped predicate holds + */ + def &&(g: A => Boolean): A => Boolean = { x: A => + f(x) && g(x) + } + + /** + * Logical 'or'. + * + * @param g the predicate to combine with the wrapped predicate + * @return a predicate that holds iff the wrapped predicate or `g` holds; `g` is only evaluated if the wrapped predicate does not hold + */ + def ||(g: A => Boolean): A => Boolean = { x: A => + f(x) || g(x) + } + + /** + * Logical 'not'. + * + * @return a predicate that holds iff the wrapped predicate does not hold + */ + def unary_! : A => Boolean = { x: A => + !f(x) + } + } +} From 7cf311cf0a672be64ee038fc808a46a825427d39 Mon Sep 17 00:00:00 2001 From: Lorenz Buehmann Date: Thu, 27 Jun 2019 10:59:11 +0200 Subject: [PATCH 13/23] Extended test output if test fails.
--- .../test/conformance/ConformanceTestBase.scala | 18 ++++++++++++------ .../conformance/RDFSConformanceTestBase.scala | 2 +- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/sansa-inference-tests/src/test/scala/net/sansa_stack/test/conformance/ConformanceTestBase.scala b/sansa-inference-tests/src/test/scala/net/sansa_stack/test/conformance/ConformanceTestBase.scala index 7c37c7a..a014a14 100644 --- a/sansa-inference-tests/src/test/scala/net/sansa_stack/test/conformance/ConformanceTestBase.scala +++ b/sansa-inference-tests/src/test/scala/net/sansa_stack/test/conformance/ConformanceTestBase.scala @@ -1,17 +1,14 @@ package net.sansa_stack.test.conformance -import java.io.{File, StringWriter} -import java.nio.file.{Path, Paths} +import scala.collection.mutable -import net.sansa_stack.inference.data.{RDF, RDFOps} import org.apache.jena.rdf.model.Model import org.apache.jena.shared.PrefixMapping import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner import org.scalatest.{BeforeAndAfterAll, FlatSpec} -import scala.collection.mutable -import net.sansa_stack.test.conformance.TestCases.getClass +import net.sansa_stack.inference.data.{RDF, RDFOps} /** * The class is to test the conformance of each materialization rule of RDFS(simple) entailment. @@ -88,10 +85,19 @@ abstract class ConformanceTestBase[Rdf <: RDF](val rdfOps: RDFOps[Rdf]) extends // compare models, i.e. 
the inferred model should contain exactly the triples of the conclusion graph val correctOutput = inferredModel.containsAll(testCase.outputGraph) + if(!correctOutput) { + println("Missing triples in inferred graph:") + testCase.outputGraph.difference(inferredModel).write(System.out, "TURTLE") + } assert(correctOutput, "contains all expected triples") + val isomorph = inferredModel.isIsomorphicWith(testCase.outputGraph) - assert(isomorph) + if(!isomorph) { + println("inferred graph:") + inferredModel.write(System.out, "TURTLE") + } + assert(isomorph, "input and output are isomorph") } } diff --git a/sansa-inference-tests/src/test/scala/net/sansa_stack/test/conformance/RDFSConformanceTestBase.scala b/sansa-inference-tests/src/test/scala/net/sansa_stack/test/conformance/RDFSConformanceTestBase.scala index df2a03e..f385290 100644 --- a/sansa-inference-tests/src/test/scala/net/sansa_stack/test/conformance/RDFSConformanceTestBase.scala +++ b/sansa-inference-tests/src/test/scala/net/sansa_stack/test/conformance/RDFSConformanceTestBase.scala @@ -5,7 +5,7 @@ import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner /** - * The class is to test the conformance of each materialization rule of RDFS(simple) entailment. + * The base class to test the conformance of each materialization rule of RDFS(simple) entailment. 
* * @author Lorenz Buehmann * From f66a9e083280640b4b62dae1a7e3f9d7c4a861db Mon Sep 17 00:00:00 2001 From: Lorenz Buehmann Date: Thu, 27 Jun 2019 10:59:50 +0200 Subject: [PATCH 14/23] Simplified conversion from RDD[Triple] to Jena Model --- .../spark/data/writer/RDFGraphWriter.scala | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/writer/RDFGraphWriter.scala b/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/writer/RDFGraphWriter.scala index 5b3c204..286f61e 100644 --- a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/writer/RDFGraphWriter.scala +++ b/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/writer/RDFGraphWriter.scala @@ -11,7 +11,8 @@ import org.slf4j.LoggerFactory import net.sansa_stack.inference.spark.data.model.RDFGraph import net.sansa_stack.inference.utils.{JenaTripleToNTripleString, RDFTripleOrdering, RDFTripleToNTripleString} -import org.apache.jena.graph.{Node, NodeFactory, Triple} +import org.apache.jena.graph.{GraphUtil, Node, NodeFactory, Triple} +import org.apache.jena.rdf.model.impl.StatementImpl import org.apache.jena.sparql.util.TripleComparator /** @@ -118,16 +119,10 @@ object RDFGraphWriter { * @return the in-memory Apache Jena model containing the triples */ def convertToModel(graph: RDFGraph): Model = { - val modelString = graph.triples - .map(new JenaTripleToNTripleString()) - .collect() - .mkString("\n") + val triples = graph.triples.collect() val model = ModelFactory.createDefaultModel() - - if (!modelString.trim.isEmpty) { - model.read(new ByteArrayInputStream(modelString.getBytes(StandardCharsets.UTF_8)), null, "N-TRIPLES") - } + GraphUtil.add(model.getGraph, triples) model } From 79e2e3a17008bdcac977bc456c5aca1360153051 Mon Sep 17 00:00:00 2001 From: Lorenz Buehmann Date: Thu, 27 Jun 2019 11:23:43 +0200 Subject: [PATCH 15/23] minor changes in I/O --- 
.../inference/flink/data/RDFGraphLoader.scala | 2 +- .../inference/flink/data/RDFGraphWriter.scala | 34 +++++++++++-------- 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraphLoader.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraphLoader.scala index 131f443..7e753fd 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraphLoader.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraphLoader.scala @@ -25,7 +25,7 @@ object RDFGraphLoader { } def loadFromDisk(paths: Seq[URI], env: ExecutionEnvironment): RDFGraph = { - RDFGraph(NTriplesReader.load(env, paths)) + RDFGraph(NTriplesReader.load(env, paths).name("triples")) } def main(args: Array[String]): Unit = { diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraphWriter.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraphWriter.scala index ed085b8..f2e4f19 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraphWriter.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraphWriter.scala @@ -7,7 +7,9 @@ import java.nio.charset.StandardCharsets import org.apache.flink.api.common.operators.Order import org.apache.flink.api.scala._ import org.apache.flink.core.fs.FileSystem +import org.apache.jena.graph.GraphUtil import org.apache.jena.rdf.model.{Model, ModelFactory} +import org.apache.jena.sparql.graph.GraphFactory import org.apache.jena.sparql.util.TripleComparator import org.slf4j.LoggerFactory @@ -52,32 +54,36 @@ object RDFGraphWriter { // sort triples if enabled val tmp = if (sorted) { - graph.triples// .sortPartition(t => t, Order.ASCENDING) // map(t => (t, t)).sortPartition(1, Order.DESCENDING).map(_._1) + graph.triples.sortPartition(_.hashCode(), 
Order.ASCENDING) } else { graph.triples } - if (singleFile) { - tmp.setParallelism(1) - } - - tmp + val sink = tmp .map(new JenaTripleToNTripleString()) // to N-TRIPLES string .writeAsText(path.toString, writeMode = FileSystem.WriteMode.OVERWRITE) + // write to single file if enabled + if (singleFile) { + sink.setParallelism(1) + } + logger.info("finished writing triples to disk in " + (System.currentTimeMillis()-startTime) + "ms.") } + /** + * Converts an RDF graph to an Apache Jena in-memory model. + * + * @note For large graphs this can be too expensive + * and lead to a OOM exception + * + * @param graph the RDF graph + * + * @return the in-memory Apache Jena model containing the triples + */ def convertToModel(graph: RDFGraph) : Model = { - val modelString = graph.triples.map(new JenaTripleToNTripleString()) - .collect().mkString("\n") - val model = ModelFactory.createDefaultModel() - - if(!modelString.trim.isEmpty) { - model.read(new ByteArrayInputStream(modelString.getBytes(StandardCharsets.UTF_8)), null, "N-TRIPLES") - } - + GraphUtil.add(model.getGraph, graph.triples.collect().toArray) model } } From 9da7479e12b368533f337e89ecbd76cb9176eeb1 Mon Sep 17 00:00:00 2001 From: Lorenz Buehmann Date: Thu, 27 Jun 2019 11:24:11 +0200 Subject: [PATCH 16/23] subtraction operation for Flink DataSet with Jena Triple --- .../inference/flink/utils/DataSetUtils.scala | 27 +++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/utils/DataSetUtils.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/utils/DataSetUtils.scala index eb9df95..5eb5997 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/utils/DataSetUtils.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/utils/DataSetUtils.scala @@ -32,14 +32,37 @@ object DataSetUtils { } /** - * Return a DataSet with the elements from this that are not 
in `other`. + * Returns a DataSet with the elements from this that are not in `other`. * * @param other the DataSet containing the element to be subtracted * @return the DataSet */ def subtract(other: DataSet[T]): DataSet[T] = { - dataset.coGroup(other).where("*").equalTo("*")(new MinusCoGroupFunction[T](true)).name("subtract") + dataset.coGroup(other).where("*").equalTo("*")( + new MinusCoGroupFunction[T](true)) + .name("subtract") } + + import scala.reflect._ + /** + * Returns a DataSet with the elements from this that are not in `other`. + * Elements are matched on the keys extracted by `keySelectorThis` (applied to this DataSet) and `keySelectorOther` (applied to `other`); both sides use the same implicit `TypeInformation[K]`. + * + * @param other the DataSet containing the elements to be subtracted + * @return the DataSet + */ + def subtract[K: ClassTag : TypeInformation](other: DataSet[T], keySelectorThis: (T) => K, keySelectorOther: (T) => K): DataSet[T] = { + + val typeInfo = implicitly[TypeInformation[K]] + dataset.coGroup(other) + .where(keySelectorThis) + .equalTo(keySelectorOther)(typeInfo)( + new MinusCoGroupFunction[T](true)) + .name("subtract") + } + + + } } From 2e84e5b2068bd9026d2f9f39e642f78e3c43976a Mon Sep 17 00:00:00 2001 From: Lorenz Buehmann Date: Thu, 27 Jun 2019 11:25:16 +0200 Subject: [PATCH 17/23] Improved Flink conformance test setup --- .../SharedOWLHorstReasonerContext.scala | 11 +---- .../SharedRDFSReasonerContext.scala | 13 ++--- .../conformance/SharedReasonerContext.scala | 48 +++++++++++++++++++ 3 files changed, 53 insertions(+), 19 deletions(-) create mode 100644 sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/SharedReasonerContext.scala diff --git a/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/SharedOWLHorstReasonerContext.scala b/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/SharedOWLHorstReasonerContext.scala index 279596c..908164c 100644 ---
a/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/SharedOWLHorstReasonerContext.scala +++ b/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/SharedOWLHorstReasonerContext.scala @@ -13,20 +13,13 @@ import org.scalatest.{BeforeAndAfterAll, Suite} * @author Lorenz Buehmann */ @RunWith(classOf[Parameterized]) -trait SharedOWLHorstReasonerContext extends BeforeAndAfterAll with ReasonerContextProvider{ +trait SharedOWLHorstReasonerContext + extends SharedReasonerContext[ForwardRuleReasonerOWLHorst] { self: Suite => - @transient private var _reasoner: ForwardRuleReasonerOWLHorst = _ - - val reasoner: ForwardRuleReasoner = _reasoner - - @transient private var _env: ExecutionEnvironment = _ - def env: ExecutionEnvironment = _env - override def beforeAll(): Unit = { super.beforeAll() - _env = ExecutionEnvironment.getExecutionEnvironment _reasoner = new ForwardRuleReasonerOWLHorst(env) } } diff --git a/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/SharedRDFSReasonerContext.scala b/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/SharedRDFSReasonerContext.scala index d2016a5..95f76d6 100644 --- a/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/SharedRDFSReasonerContext.scala +++ b/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/SharedRDFSReasonerContext.scala @@ -3,6 +3,7 @@ package net.sansa_stack.inference.flink.conformance import net.sansa_stack.inference.flink.forwardchaining.{ForwardRuleReasoner, ForwardRuleReasonerRDFS} import net.sansa_stack.inference.rules.RDFSLevel import org.apache.flink.api.scala.ExecutionEnvironment +import org.apache.flink.test.util.AbstractTestBase import org.junit.runner.RunWith import org.junit.runners.Parameterized import org.scalatest.{BeforeAndAfterAll, Suite} @@ -13,20 +14,12 @@ import org.scalatest.{BeforeAndAfterAll, Suite} * @author Lorenz 
Buehmann */ @RunWith(classOf[Parameterized]) -trait SharedRDFSReasonerContext extends BeforeAndAfterAll with ReasonerContextProvider{ +trait SharedRDFSReasonerContext + extends SharedReasonerContext[ForwardRuleReasonerRDFS] { self: Suite => - @transient private var _reasoner: ForwardRuleReasonerRDFS = _ - def reasoner: ForwardRuleReasoner = _reasoner - - @transient private var _env: ExecutionEnvironment = _ - def env: ExecutionEnvironment = _env - - override def beforeAll(): Unit = { super.beforeAll() - _env = ExecutionEnvironment.getExecutionEnvironment - _env.getConfig.disableSysoutLogging() _reasoner = new ForwardRuleReasonerRDFS(env) _reasoner.level = RDFSLevel.SIMPLE } diff --git a/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/SharedReasonerContext.scala b/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/SharedReasonerContext.scala new file mode 100644 index 0000000..701698e --- /dev/null +++ b/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/SharedReasonerContext.scala @@ -0,0 +1,48 @@ +package net.sansa_stack.inference.flink.conformance + +import net.sansa_stack.rdf.common.kryo.jena.JenaKryoSerializers.{NodeSerializer, TripleSerializer} +import org.apache.flink.api.scala.ExecutionEnvironment +import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration +import org.apache.jena.graph.{Node, Triple} +import org.scalatest.{BeforeAndAfterAll, Suite} + +import net.sansa_stack.inference.flink.forwardchaining.ForwardRuleReasoner + +/** + * A shared reasoner and Flink environment used for multiple test cases using the same resources. 
+ * + * @author Lorenz Buehmann + */ +trait SharedReasonerContext[R <: ForwardRuleReasoner] + extends BeforeAndAfterAll + with ReasonerContextProvider { + self: Suite => + + @transient protected var _reasoner: R = _ + def reasoner: R = _reasoner + + @transient private var _env: ExecutionEnvironment = _ + def env: ExecutionEnvironment = _env + + override def beforeAll(): Unit = { + super.beforeAll() + _env = ExecutionEnvironment.getExecutionEnvironment + _env.setParallelism(4) + _env.getConfig.disableSysoutLogging() + _env.getConfig.addDefaultKryoSerializer(classOf[Triple], classOf[TripleSerializer]) + _env.getConfig.addDefaultKryoSerializer(classOf[Node], classOf[NodeSerializer]) + } + + import org.apache.flink.test.util.MiniClusterWithClientResource + import org.junit.ClassRule + + private val DEFAULT_PARALLELISM = 4 + + @ClassRule val miniClusterResource = new MiniClusterWithClientResource( + new MiniClusterResourceConfiguration.Builder() + .setNumberTaskManagers(1) + .setNumberSlotsPerTaskManager(DEFAULT_PARALLELISM) + .build + ) + +} From 89394b0727bd1d3eb62dfbb47bf2dc7ccc72717d Mon Sep 17 00:00:00 2001 From: Lorenz Buehmann Date: Thu, 27 Jun 2019 14:19:46 +0200 Subject: [PATCH 18/23] log output of test base class --- .../conformance/ConformanceTestBase.scala | 28 ++++++++++++++----- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/sansa-inference-tests/src/test/scala/net/sansa_stack/test/conformance/ConformanceTestBase.scala b/sansa-inference-tests/src/test/scala/net/sansa_stack/test/conformance/ConformanceTestBase.scala index a014a14..6f779b0 100644 --- a/sansa-inference-tests/src/test/scala/net/sansa_stack/test/conformance/ConformanceTestBase.scala +++ b/sansa-inference-tests/src/test/scala/net/sansa_stack/test/conformance/ConformanceTestBase.scala @@ -1,7 +1,10 @@ package net.sansa_stack.test.conformance +import java.io.ByteArrayOutputStream + import scala.collection.mutable +import com.typesafe.scalalogging.LazyLogging import 
org.apache.jena.rdf.model.Model import org.apache.jena.shared.PrefixMapping import org.junit.runner.RunWith @@ -17,9 +20,10 @@ import net.sansa_stack.inference.data.{RDF, RDFOps} * */ @RunWith(classOf[JUnitRunner]) -abstract class ConformanceTestBase[Rdf <: RDF](val rdfOps: RDFOps[Rdf]) extends FlatSpec with BeforeAndAfterAll { - - val logger = com.typesafe.scalalogging.Logger("ConformanceTestBase") +abstract class ConformanceTestBase[Rdf <: RDF](val rdfOps: RDFOps[Rdf]) + extends FlatSpec + with BeforeAndAfterAll + with LazyLogging { behavior of "" @@ -86,20 +90,30 @@ abstract class ConformanceTestBase[Rdf <: RDF](val rdfOps: RDFOps[Rdf]) extends // compare models, i.e. the inferred model should contain exactly the triples of the conclusion graph val correctOutput = inferredModel.containsAll(testCase.outputGraph) if(!correctOutput) { - println("Missing triples in inferred graph:") - testCase.outputGraph.difference(inferredModel).write(System.out, "TURTLE") + logger.whenErrorEnabled { + logger.error("Missing triples in inferred graph:\n {}", toNTriplesString(testCase.outputGraph.difference(inferredModel))) + } } assert(correctOutput, "contains all expected triples") val isomorph = inferredModel.isIsomorphicWith(testCase.outputGraph) if(!isomorph) { - println("inferred graph:") - inferredModel.write(System.out, "TURTLE") + logger.whenErrorEnabled { + logger.error(s"Inferred graph not isomorph to target graph. 
Inferred triples:\n{}", toNTriplesString(inferredModel)) + } + } assert(isomorph, "input and output are isomorph") } } + private def toNTriplesString(model: Model): String = { + val baos = new ByteArrayOutputStream() + model.write(baos, "N-Triples") + val s = new String(baos.toByteArray) + s + } + def computeInferredModel(triples: mutable.HashSet[Rdf#Triple]): Model } From af1ff9451cfb80130dac597ffae293532c76652a Mon Sep 17 00:00:00 2001 From: Lorenz Buehmann Date: Fri, 28 Jun 2019 09:28:06 +0200 Subject: [PATCH 19/23] Flink conformance test clean up --- .../flink/conformance/OWLHorstConformanceTest.scala | 8 +++----- .../flink/conformance/RDFSConformanceTest.scala | 10 ++++------ .../conformance/SharedOWLHorstReasonerContext.scala | 7 +++---- .../flink/conformance/SharedReasonerContext.scala | 8 ++++++++ .../inference/flink/rules/TransitivityRuleTest.scala | 2 +- 5 files changed, 19 insertions(+), 16 deletions(-) diff --git a/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/OWLHorstConformanceTest.scala b/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/OWLHorstConformanceTest.scala index 81570ce..5609870 100644 --- a/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/OWLHorstConformanceTest.scala +++ b/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/OWLHorstConformanceTest.scala @@ -22,17 +22,15 @@ class OWLHorstConformanceTest override def computeInferredModel(triples: mutable.HashSet[Triple]): Model = { // distribute triples - val triplesRDD = env.fromCollection(triples) + val triplesDS = env.fromCollection(triples) // create graph - val graph = RDFGraph(triplesRDD) + val graph = RDFGraph(triplesDS) // compute inferred graph val inferredGraph = reasoner.apply(graph) - inferredGraph.triples.print() - - // convert to JENA model + // convert to Jena model val inferredModel = RDFGraphWriter.convertToModel(inferredGraph) inferredModel diff 
--git a/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/RDFSConformanceTest.scala b/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/RDFSConformanceTest.scala index fd7b3dc..4222385 100644 --- a/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/RDFSConformanceTest.scala +++ b/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/RDFSConformanceTest.scala @@ -11,7 +11,7 @@ import net.sansa_stack.inference.flink.data.{RDFGraph, RDFGraphWriter} import net.sansa_stack.test.conformance.RDFSConformanceTestBase /** - * The class is to test the conformance of each materialization rule of RDFS(simple) entailment. + * The class is used to check the conformance of each materialization rule of RDFS(simple) entailment. * * @author Lorenz Buehmann * @@ -22,17 +22,15 @@ class RDFSConformanceTest override def computeInferredModel(triples: mutable.HashSet[Triple]): Model = { // distribute triples - val triplesRDD = env.fromCollection(triples) + val triplesDS = env.fromCollection(triples) // create graph - val graph = RDFGraph(triplesRDD) + val graph = RDFGraph(triplesDS) // compute inferred graph val inferredGraph = reasoner.apply(graph) - inferredGraph.triples.print() - - // convert to JENA model + // convert to Jena model val inferredModel = RDFGraphWriter.convertToModel(inferredGraph) inferredModel diff --git a/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/SharedOWLHorstReasonerContext.scala b/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/SharedOWLHorstReasonerContext.scala index 908164c..4059416 100644 --- a/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/SharedOWLHorstReasonerContext.scala +++ b/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/SharedOWLHorstReasonerContext.scala @@ -1,14 +1,13 @@ package 
net.sansa_stack.inference.flink.conformance -import net.sansa_stack.inference.flink.forwardchaining.ForwardRuleReasoner -import org.apache.flink.api.scala.ExecutionEnvironment import org.junit.runner.RunWith import org.junit.runners.Parameterized +import org.scalatest.Suite + import net.sansa_stack.inference.flink.forwardchaining.ForwardRuleReasonerOWLHorst -import org.scalatest.{BeforeAndAfterAll, Suite} /** - * Test context to share an RDFS reasoner. + * Test context to share an OWL Horst reasoner. * * @author Lorenz Buehmann */ diff --git a/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/SharedReasonerContext.scala b/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/SharedReasonerContext.scala index 701698e..a0f49c9 100644 --- a/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/SharedReasonerContext.scala +++ b/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/conformance/SharedReasonerContext.scala @@ -3,6 +3,7 @@ package net.sansa_stack.inference.flink.conformance import net.sansa_stack.rdf.common.kryo.jena.JenaKryoSerializers.{NodeSerializer, TripleSerializer} import org.apache.flink.api.scala.ExecutionEnvironment import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration +import org.apache.flink.test.util.MiniClusterWithClientResource import org.apache.jena.graph.{Node, Triple} import org.scalatest.{BeforeAndAfterAll, Suite} @@ -26,6 +27,13 @@ trait SharedReasonerContext[R <: ForwardRuleReasoner] override def beforeAll(): Unit = { super.beforeAll() + + val miniClusterResource = new MiniClusterWithClientResource( + new MiniClusterResourceConfiguration.Builder() + .setNumberTaskManagers(1) + .setNumberSlotsPerTaskManager(DEFAULT_PARALLELISM) + .build) + _env = ExecutionEnvironment.getExecutionEnvironment _env.setParallelism(4) _env.getConfig.disableSysoutLogging() diff --git 
a/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/rules/TransitivityRuleTest.scala b/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/rules/TransitivityRuleTest.scala index 873d10f..ad57339 100644 --- a/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/rules/TransitivityRuleTest.scala +++ b/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/rules/TransitivityRuleTest.scala @@ -10,7 +10,7 @@ import net.sansa_stack.inference.flink.data.{RDFGraph, RDFGraphWriter} import net.sansa_stack.inference.flink.forwardchaining.ForwardRuleReasonerRDFS /** - * A forward chaining implementation of the RDFS entailment regime. + * Test class to check transitivity chain rule is applied correctly in the RDFS reasoner. * * @author Lorenz Buehmann */ From 8a4d6d3c164c8a11cbdee5bcca216382e06172fc Mon Sep 17 00:00:00 2001 From: Lorenz Buehmann Date: Fri, 28 Jun 2019 09:29:05 +0200 Subject: [PATCH 20/23] Flink reasoning on Triple DataSet --- .../forwardchaining/ForwardRuleReasoner.scala | 42 ++- .../ForwardRuleReasonerOWLHorst.scala | 292 ++++++++++++------ .../ForwardRuleReasonerRDFS.scala | 10 +- .../forwardchaining/TransitiveReasoner.scala | 47 ++- 4 files changed, 270 insertions(+), 121 deletions(-) diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasoner.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasoner.scala index 45e5cca..50833fc 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasoner.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasoner.scala @@ -5,6 +5,8 @@ import org.apache.flink.api.scala.DataSet import scala.collection.mutable import org.apache.jena.graph.{Node, Triple} +import org.apache.jena.shared.PrefixMapping +import 
org.apache.jena.sparql.util.FmtUtils /** * A forward chaining based reasoner. @@ -50,7 +52,7 @@ trait ForwardRuleReasoner extends TransitiveReasoner{ * @return the DataSet of triples that contain the predicate */ def extractTriples(triples: DataSet[Triple], predicate: Node): DataSet[Triple] = { - triples.filter(triple => triple.predicateMatches(predicate)) + triples.filter(triple => triple.predicateMatches(predicate)).name(s"${FmtUtils.stringForNode(predicate)} triples") } /** @@ -66,21 +68,33 @@ trait ForwardRuleReasoner extends TransitiveReasoner{ subject: Option[Node], predicate: Option[Node], obj: Option[Node]): DataSet[Triple] = { - var extractedTriples = triples - - if(subject.isDefined) { - extractedTriples = extractedTriples.filter(triple => triple.subjectMatches(subject.get)) - } - - if(predicate.isDefined) { - extractedTriples = extractedTriples.filter(triple => triple.predicateMatches(predicate.get)) - } +// import net.sansa_stack.inference.utils.PredicateUtils._ +// var extractedTriples = triples +// var filter = (t: Triple) => true +// +// if(subject.isDefined) { +// filter = filter || (_.subjectMatches(subject.get)) +//// extractedTriples = extractedTriples.filter(triple => triple.subjectMatches(subject.get)) +// } +// +// if(predicate.isDefined) { +// filter = filter || (_.predicateMatches(predicate.get)) +//// extractedTriples = extractedTriples.filter(triple => triple.predicateMatches(predicate.get)) +// } +// +// if(obj.isDefined) { +// filter = filter || (_.objectMatches(obj.get)) +//// extractedTriples = extractedTriples.filter(triple => triple.objectMatches(obj.get)) +// } +// +// extractedTriples.filter(filter) - if(obj.isDefined) { - extractedTriples = extractedTriples.filter(triple => triple.objectMatches(obj.get)) - } + val filterFct = (t: Triple) => + t.subjectMatches(subject.orNull) || + t.predicateMatches(predicate.orNull) || + t.objectMatches(obj.orNull) - extractedTriples + triples.filter(filterFct) } } diff --git 
a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasonerOWLHorst.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasonerOWLHorst.scala index 2093e52..d9a3a10 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasonerOWLHorst.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasonerOWLHorst.scala @@ -1,12 +1,20 @@ package net.sansa_stack.inference.flink.forwardchaining +import org.apache.flink.api.common.functions.{GroupReduceFunction, JoinFunction, RichFilterFunction} +import org.apache.flink.api.common.typeinfo.TypeInformation import org.apache.flink.api.scala.{DataSet, ExecutionEnvironment, _} +import org.apache.flink.util.Collector import org.apache.jena.graph.{Node, Triple} import org.apache.jena.vocabulary.{OWL2, RDF, RDFS} import org.slf4j.LoggerFactory import net.sansa_stack.inference.flink.data.RDFGraph import net.sansa_stack.inference.utils.CollectionUtils +import java.lang.Iterable + +import scala.collection.JavaConverters._ + +import org.apache.flink.configuration.Configuration /** * A forward chaining implementation of the OWL Horst entailment regime. 
@@ -19,49 +27,93 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule private val logger = com.typesafe.scalalogging.Logger(LoggerFactory.getLogger(this.getClass.getName)) + val tripleKeyFct : Triple => Int = {t => t.hashCode()} + val nodePairKeyFct : ((Node, Node)) => Int = {case (n1, n2) => n1.hashCode() * 17 + n2.hashCode() * 31} + val nodePairKVKeyFct : (((Node, Node), Node)) => Int = {case (k, v) => k._1.hashCode() * 17 + k._2.hashCode() * 31} + val nodePairKVKeyFct2 : (((Node, Node), Nil.type )) => Int = {case (k, v) => k._1.hashCode() * 17 + k._2.hashCode() * 31} + val fct1: (((Node, Node), Node), ((Node, Node), Nil.type)) => (Node, Node) = {case (l: ((Node, Node), Node), r: ((Node, Node), Nil.type)) => (l._2, r._1._1)} + val fct2: (((Node, Node), Node), ((Node, Node), Node)) => (Node, Node) = {case (l: ((Node, Node), Node), r: ((Node, Node), Node)) => (l._2, r._1._1)} + def apply(graph: RDFGraph): RDFGraph = { logger.info("materializing graph...") val startTime = System.currentTimeMillis() - val triplesRDD = graph.triples + // all the triples in the graph + val triplesDS = graph.triples + + val schemaPredicates = Set( + RDFS.subClassOf, RDFS.subPropertyOf, RDFS.domain, RDFS.range, + OWL2.equivalentClass, OWL2.equivalentProperty, OWL2.allValuesFrom, OWL2.someValuesFrom, OWL2.inverseOf, OWL2.onProperty, + OWL2.hasValue, OWL2.cardinality, OWL2.minCardinality, OWL2.maxCardinality, OWL2.maxQualifiedCardinality, + OWL2.complementOf, OWL2.unionOf, OWL2.intersectionOf + ).map(_.asNode()) + + // split the triples into schema and instance data + // (use a closure variable, but could also used broadcast variable here) +// val schemaPredicatesDS = env.fromCollection(schemaPredicates) +// val BC = "schemaPredicates" + val schemaTriplesDS = triplesDS + .filter(t => schemaPredicates.contains(t.getPredicate)) +// .filter(new RichFilterFunction[Triple] { +// var schemaPredicates: Set[Node] = _ +// override def open(config: Configuration): Unit 
= { +// schemaPredicates = getRuntimeContext.getBroadcastVariable[Node](BC).asScala.toSet +// } +// override def filter(t: Triple): Boolean = schemaPredicates.contains(t.getPredicate) +// }) +// .withBroadcastSet(schemaPredicatesDS, BC) + .name("schema triples") + + val instanceTriplesDS = triplesDS + .filter(t => !schemaPredicates.contains(t.getPredicate)) +// .filter(new RichFilterFunction[Triple] { +// var schemaPredicates: Set[Node] = _ +// override def open(config: Configuration): Unit = { +// schemaPredicates = getRuntimeContext.getBroadcastVariable[Node](BC).asScala.toSet +// } +// override def filter(t: Triple): Boolean = !schemaPredicates.contains(t.getPredicate) +// }) +// .withBroadcastSet(schemaPredicatesDS, BC) + .name("instance data triples") // extract the schema data - var subClassOfTriples = extractTriples(triplesRDD, RDFS.subClassOf.asNode()) // rdfs:subClassOf - var subPropertyOfTriples = extractTriples(triplesRDD, RDFS.subPropertyOf.asNode()) // rdfs:subPropertyOf - val domainTriples = extractTriples(triplesRDD, RDFS.domain.asNode()) // rdfs:domain - val rangeTriples = extractTriples(triplesRDD, RDFS.range.asNode()) // rdfs:range - val equivClassTriples = extractTriples(triplesRDD, OWL2.equivalentClass.asNode()) // owl:equivalentClass - val equivPropertyTriples = extractTriples(triplesRDD, OWL2.equivalentProperty.asNode()) // owl:equivalentProperty + var subClassOfTriplesDS = extractTriples(schemaTriplesDS, RDFS.subClassOf.asNode()) // rdfs:subClassOf + var subPropertyOfTriplesDS = extractTriples(schemaTriplesDS, RDFS.subPropertyOf.asNode()) // rdfs:subPropertyOf + val domainTriplesDS = extractTriples(schemaTriplesDS, RDFS.domain.asNode()) // rdfs:domain + val rangeTriplesDS = extractTriples(schemaTriplesDS, RDFS.range.asNode()) // rdfs:range + val equivClassTriplesDS = extractTriples(schemaTriplesDS, OWL2.equivalentClass.asNode()) // owl:equivalentClass + val equivPropertyTriplesDS = extractTriples(schemaTriplesDS, 
OWL2.equivalentProperty.asNode()) // owl:equivalentProperty // 1. we have to process owl:equivalentClass and owl:equivalentProperty before computing the transitive closure // rdfp12a: (?C owl:equivalentClass ?D) -> (?C rdfs:subClassOf ?D ) - val tmp_12a = equivClassTriples.map(t => Triple.create(t.getSubject, RDFS.subClassOf.asNode(), t.getObject)) + val tmp_12a = equivClassTriplesDS.map(t => Triple.create(t.getSubject, RDFS.subClassOf.asNode(), t.getObject)) // rdfp12b: (?C owl:equivalentClass ?D) -> (?D rdfs:subClassOf ?C ) - val tmp_12b = equivClassTriples.map(t => Triple.create(t.getObject, RDFS.subClassOf.asNode(), t.getSubject)) - subClassOfTriples = env.union(Seq(subClassOfTriples, tmp_12a, tmp_12b)) - .distinct() + val tmp_12b = equivClassTriplesDS.map(t => Triple.create(t.getObject, RDFS.subClassOf.asNode(), t.getSubject)) + subClassOfTriplesDS = env.union(Seq(subClassOfTriplesDS, tmp_12a, tmp_12b)) + .distinct(tripleKeyFct) // rdfp13a: (?C owl:equivalentProperty ?D) -> (?C rdfs:subPropertyOf ?D ) - val tmp_13a = equivPropertyTriples.map(t => Triple.create(t.getSubject, RDFS.subPropertyOf.asNode(), t.getObject)) + val tmp_13a = equivPropertyTriplesDS.map(t => Triple.create(t.getSubject, RDFS.subPropertyOf.asNode(), t.getObject)) // rdfp13b: (?C owl:equivalentProperty ?D) -> (?D rdfs:subPropertyOf ?C ) - val tmp_13b = equivPropertyTriples.map(t => Triple.create(t.getObject, RDFS.subPropertyOf.asNode(), t.getSubject)) - subPropertyOfTriples = env.union(Seq(subPropertyOfTriples, tmp_13a, tmp_13b)) - .distinct() + val tmp_13b = equivPropertyTriplesDS.map(t => Triple.create(t.getObject, RDFS.subPropertyOf.asNode(), t.getSubject)) + subPropertyOfTriplesDS = env.union(Seq(subPropertyOfTriplesDS, tmp_13a, tmp_13b)) + .distinct(tripleKeyFct) // 2. 
we compute the transitive closure of rdfs:subPropertyOf and rdfs:subClassOf // rdfs11: (xxx rdfs:subClassOf yyy), (yyy rdfs:subClassOf zzz) -> (xxx rdfs:subClassOf zzz) - val subClassOfTriplesTrans = computeTransitiveClosure(subClassOfTriples) + val subClassOfTriplesTransDS = computeTransitiveClosure(subClassOfTriplesDS).name("TC subClassOf") + // rdfs5: (xxx rdfs:subPropertyOf yyy), (yyy rdfs:subPropertyOf zzz) -> (xxx rdfs:subPropertyOf zzz) - val subPropertyOfTriplesTrans = computeTransitiveClosure(subPropertyOfTriples) + val subPropertyOfTriplesTransDS = computeTransitiveClosure(subPropertyOfTriplesDS).name("TC subPropertyOf") // we put all into maps which should be more efficient later on - val subClassOfMap = CollectionUtils.toMultiMap(subClassOfTriplesTrans.map(t => (t.getSubject, t.getObject)).collect) - val subPropertyMap = CollectionUtils.toMultiMap(subPropertyOfTriplesTrans.map(t => (t.getSubject, t.getObject)).collect) - val domainMap = domainTriples.map(t => (t.getSubject, t.getObject)).collect.toMap - val rangeMap = rangeTriples.map(t => (t.getSubject, t.getObject)).collect.toMap + val subClassOfMap = CollectionUtils.toMultiMap(subClassOfTriplesTransDS.map(t => (t.getSubject, t.getObject)).collect) + val subPropertyMap = CollectionUtils.toMultiMap(subPropertyOfTriplesTransDS.map(t => (t.getSubject, t.getObject)).collect) + val domainMap = domainTriplesDS.map(t => (t.getSubject, t.getObject)).collect.toMap + val rangeMap = rangeTriplesDS.map(t => (t.getSubject, t.getObject)).collect.toMap // TODO broadcast schema in with Flink // // distribute the schema data structures by means of shared variables @@ -73,40 +125,40 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule // compute the equivalence of classes and properties // rdfp12c: (?C rdfs:subClassOf ?D ), (?D rdfs:subClassOf ?C ) -> (?C owl:equivalentClass ?D) - val equivClassTriplesInf = equivClassTriples.union( - subClassOfTriplesTrans + val equivClassTriplesInf = 
equivClassTriplesDS.union( + subClassOfTriplesTransDS .filter(t => subClassOfMap.getOrElse(t.getObject, Set.empty).contains(t.getSubject)) .map(t => Triple.create(t.getSubject, OWL2.equivalentClass.asNode(), t.getObject)) - ) + ).name("rdfp12c") // rdfp13c: (?C rdfs:subPropertyOf ?D ), (?D rdfs:subPropertyOf ?C ) -> (?C owl:equivalentProperty ?D) - val equivPropTriplesInf = equivPropertyTriples.union( - subPropertyOfTriplesTrans + val equivPropTriplesInf = equivPropertyTriplesDS.union( + subPropertyOfTriplesTransDS .filter(t => subPropertyMap.getOrElse(t.getObject, Set.empty).contains(t.getSubject)) .map(t => Triple.create(t.getSubject, OWL2.equivalentProperty.asNode(), t.getObject)) - ) + ).name("rdfp13c") // we also extract properties with certain OWL characteristic and share them val transitiveProperties = - extractTriples(triplesRDD, None, None, Some(OWL2.TransitiveProperty.asNode())) + extractTriples(triplesDS, None, None, Some(OWL2.TransitiveProperty.asNode())).name("transitive property triples") .map(triple => triple.getSubject) .collect() val functionalProperties = - extractTriples(triplesRDD, None, None, Some(OWL2.FunctionalProperty.asNode())) + extractTriples(triplesDS, None, None, Some(OWL2.FunctionalProperty.asNode())).name("functional property triples") .map(triple => triple.getSubject) .collect() val inverseFunctionalProperties = - extractTriples(triplesRDD, None, None, Some(OWL2.InverseFunctionalProperty.asNode())) + extractTriples(triplesDS, None, None, Some(OWL2.InverseFunctionalProperty.asNode())).name("inverse functional property triples") .map(triple => triple.getSubject) .collect() val symmetricProperties = - extractTriples(triplesRDD, None, None, Some(OWL2.SymmetricProperty.asNode())) + extractTriples(triplesDS, None, None, Some(OWL2.SymmetricProperty.asNode())).name("symmetric property triples") .map(triple => triple.getSubject) .collect() // and inverse property definitions val inverseOfMap = - extractTriples(triplesRDD, None, 
Some(OWL2.inverseOf.asNode()), None) + extractTriples(schemaTriplesDS, OWL2.inverseOf.asNode()) .map(triple => (triple.getSubject, triple.getObject)) .collect() .toMap @@ -115,28 +167,26 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule // and more OWL vocabulary used in property restrictions // owl:someValuesFrom val someValuesFromMap = - extractTriples(triplesRDD, None, Some(OWL2.someValuesFrom.asNode()), None) + extractTriples(schemaTriplesDS, OWL2.someValuesFrom.asNode()) .map(triple => (triple.getSubject, triple.getObject)) .collect() .toMap val someValuesFromMapReversed = someValuesFromMap.map(_.swap) // owl:allValuesFrom val allValuesFromMap = - extractTriples(triplesRDD, None, Some(OWL2.allValuesFrom.asNode()), None) + extractTriples(schemaTriplesDS, OWL2.allValuesFrom.asNode()) .map(triple => (triple.getSubject, triple.getObject)) .collect() .toMap - val allValuesFromMapReversed = allValuesFromMap.map(_.swap) // owl:hasValue val hasValueMap = - extractTriples(triplesRDD, None, Some(OWL2.hasValue.asNode()), None) + extractTriples(schemaTriplesDS, OWL2.hasValue.asNode()) .map(triple => (triple.getSubject, triple.getObject)) .collect() .toMap - val hasValueMapReversed = hasValueMap.groupBy(_._2).mapValues(_.keys).map(identity) // owl:onProperty val onPropertyMap = - extractTriples(triplesRDD, None, Some(OWL2.onProperty.asNode()), None) + extractTriples(triplesDS, OWL2.onProperty.asNode()) .map(triple => (triple.getSubject, triple.getObject)) .collect() .toMap @@ -144,10 +194,10 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule // owl:sameAs is computed separately, thus, we split the data - var triplesFiltered = triplesRDD.filter(triple => !triple.predicateMatches(OWL2.sameAs.asNode()) - && !triple.predicateMatches(RDF.`type`.asNode())) - var sameAsTriples = triplesRDD.filter(triple => triple.predicateMatches(OWL2.sameAs.asNode())) - var typeTriples = triplesRDD.filter(triple => 
triple.predicateMatches(RDF.`type`.asNode())) + var triplesFiltered = instanceTriplesDS.filter(t => !t.predicateMatches(OWL2.sameAs.asNode()) + && !t.predicateMatches(RDF.`type`.asNode())) + var sameAsTriples = extractTriples(instanceTriplesDS, OWL2.sameAs.asNode()) + var typeTriples = extractTriples(instanceTriplesDS, RDF.`type`.asNode()) // println("input rdf:type triples:\n" + typeTriples.collect().mkString("\n")) @@ -168,6 +218,7 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule triplesFiltered .filter(t => subPropertyMap.contains(t.getPredicate)) .flatMap(t => subPropertyMap(t.getPredicate).map(supProp => Triple.create(t.getSubject, supProp, t.getObject))) + .name("rdfs7") // add the inferred triples to the existing triples val rdfs7Res = triplesRDFS7.union(triplesFiltered) @@ -182,6 +233,7 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule rdfs7Res .filter(t => domainMap.contains(t.getPredicate)) .map(t => Triple.create(t.getSubject, RDF.`type`.asNode(), domainMap(t.getPredicate))) + .name("rdfs2") /* rdfs3 aaa rdfs:range xxx . @@ -191,7 +243,7 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule rdfs7Res .filter(t => rangeMap.contains(t.getPredicate)) .map(t => Triple.create(t.getObject, RDF.`type`.asNode(), rangeMap(t.getPredicate))) - + .name("rdfs3") // 4. 
SubClass inheritance according to rdfs9 // input are the rdf:type triples from RDFS2/RDFS3 and the ones contained in the original graph @@ -206,17 +258,26 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule .union(typeTriples) .filter(t => subClassOfMap.contains(t.getObject)) // such that A has a super class B .flatMap(t => subClassOfMap(t.getObject).map(supCls => Triple.create(t.getSubject, RDF.`type`.asNode(), supCls))) // create triple (s a B) - + .name("rdfs9") // rdfp14b: (?R owl:hasValue ?V),(?R owl:onProperty ?P),(?X rdf:type ?R ) -> (?X ?P ?V ) val rdfp14b = typeTriples - .filter(triple => - hasValueMap.contains(triple.getObject) && - onPropertyMap.contains(triple.getObject) - ) - .map(triple => - Triple.create(triple.getSubject, onPropertyMap(triple.getObject), hasValueMap(triple.getObject)) - ) + .flatMap {(t, out: Collector[Triple]) => + if (hasValueMap.contains(t.getObject) && onPropertyMap.contains(t.getObject)) { + out.collect(Triple.create(t.getSubject, onPropertyMap(t.getObject), hasValueMap(t.getObject))) + } + } +// .filter(triple => +// hasValueMap.contains(triple.getObject) && +// onPropertyMap.contains(triple.getObject) +// ) +// .map(triple => +// Triple.create(triple.getSubject, onPropertyMap(triple.getObject), hasValueMap(triple.getObject)) +// ) + .name("rdfp14b") + logger.whenDebugEnabled { + println("rdfs14b:\n" + rdfp14b.collect().mkString("\n")) + } // rdfp14a: (?R owl:hasValue ?V), (?R owl:onProperty ?P), (?U ?P ?V) -> (?U rdf:type ?R) // println(rdfs7Res.collect().mkString("\n")) @@ -252,22 +313,31 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule Triple.create(s, p, o) } ) - println(rdfp14a.collect().mkString("\n")) + .name("rdfp14a") + logger.whenDebugEnabled { + println("rdf14a:\n" + rdfp14a.collect().mkString("\n")) + } + // rdfp8a: (?P owl:inverseOf ?Q), (?X ?P ?Y) -> (?Y ?Q ?X) val rdfp8a = triplesFiltered .filter(triple => 
inverseOfMap.contains(triple.getPredicate)) .map(triple => Triple.create(triple.getObject, inverseOfMap(triple.getPredicate), triple.getSubject)) + .name("rdfp8a") // rdfp8b: (?P owl:inverseOf ?Q), (?X ?Q ?Y) -> (?Y ?P ?X) val rdfp8b = triplesFiltered .filter(triple => inverseOfMapReverted.contains(triple.getPredicate)) .map(triple => Triple.create(triple.getObject, inverseOfMapReverted(triple.getPredicate), triple.getSubject)) + .name("rdfp8b") // rdfp3: (?P rdf:type owl:SymmetricProperty), (?X ?P ?Y) -> (?Y ?P ?X) val rdfp3 = triplesFiltered .filter(triple => symmetricProperties.contains(triple.getPredicate)) .map(triple => Triple.create(triple.getObject, triple.getPredicate, triple.getSubject)) + .name("rdfp3") + + import org.apache.flink.api.scala._ // rdfp15: (?R owl:someValuesFrom ?D), (?R owl:onProperty ?P), (?X ?P ?A), (?A rdf:type ?D ) -> (?X rdf:type ?R ) val rdfp15_1 = triplesFiltered @@ -278,16 +348,28 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule }) // .flatMap(identity) + implicit val typeInfo = TypeInformation.of(classOf[((Node, Node), Nil.type)]) val rdfp15_2 = typeTriples .filter(triple => someValuesFromMapReversed.contains(triple.getObject)) - .map(triple => ((someValuesFromMapReversed(triple.getObject), triple.getSubject), "s")) // -> ((?R, ?A), NIL) + .map(triple => ((someValuesFromMapReversed(triple.getObject), triple.getSubject), Nil)) // -> ((?R, ?A), NIL) + + +// val tmp1: DataSet[(Node, Node)] = rdfp15_1.join(rdfp15_2).where(nodePairKVKeyFct).equalTo(nodePairKVKeyFct2) {case (l1, r2) => (l1, r2)} + + implicit val keyInfo: TypeInformation[Int] = createTypeInformation[Int] val rdfp15 = rdfp15_1 - .join(rdfp15_2).where(0).equalTo(0)({// ((?R, ?A), ?X) x ((?R, ?A), NIL) - (l, r) => (l._2, r._1._1) - }) + .join(rdfp15_2).where(nodePairKVKeyFct).equalTo(nodePairKVKeyFct2) +// ( +// new JoinFunction[((Node, Node), Node), ((Node, Node), Nil.type), (Node, Node)] { +// override def join(l: ((Node, Node), Node), 
r: ((Node, Node), Nil.type)): (Node, Node) = { +// (l._2, r._1._1) +// } +// ) + .apply(fct1)// ((?R, ?A), ?X) x ((?R, ?A), NIL) .map(e => Triple.create(e._1, RDF.`type`.asNode(), e._2)) // -> (?X rdf:type ?R ) + .name("rdfp15") // println(rdfp15.collect().mkString("\n")) @@ -299,36 +381,46 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule val restrictions = onPropertyMapReversed(triple.getPredicate) restrictions.map(_r => (triple.getSubject -> _r, triple.getObject)) // -> ((?X, ?R), ?Y) }) + .name("rdfp16_1") // .flatMap(identity) - -// println("rdfp16_1:\n" + rdfp16_1.collect().mkString("\n")) + logger.whenDebugEnabled { + println("rdfp16_1:\n" + rdfp16_1.collect().mkString("\n")) + } val rdfp16_2 = typeTriples // (?X rdf:type ?R ) .filter(triple => allValuesFromMap.contains(triple.getObject) && onPropertyMap.contains(triple.getObject)) // (?R owl:allValuesFrom ?D), (?R owl:onProperty ?P) .map(triple => ((triple.getSubject, triple.getObject), allValuesFromMap(triple.getObject))) // -> ((?X, ?R), ?D) + .name("rdfp16_2") + logger.whenDebugEnabled { + println("rdfp16_2:\n" + rdfp16_2.collect().mkString("\n")) + } -// println("rdfp16_2:\n" + rdfp16_2.collect().mkString("\n")) - - val rdfp16 = rdfp16_1 - .join(rdfp16_2).where(0).equalTo(0) ({// ((?X, ?R), ?Y) x ((?X, ?R), ?D) - (l, r) => (l._2, r._2) // -> (Y, D) - }) - .map(e => Triple.create(e._1, RDF.`type`.asNode(), e._2)) // -> (?Y rdf:type ?D ) +// import org.apache.flink.api.scala._ + val rdfp16 = rdfp16_1.join(rdfp16_2).where(nodePairKVKeyFct).equalTo(nodePairKVKeyFct)(keyInfo) {// ((?X, ?R), ?Y) x ((?X, ?R), ?D) +// (l: ((Node, Node), Node), r: ((Node, Node), Node)) => (l._2, r._2)} // -> (Y, D) +// {case (l: ((Node, Node), Node), r: ((Node, Node), Node)) => (l._2, r._2)} // -> (Y, D) + (l, r) => Triple.create(l._2, RDF.`type`.asNode(), r._2)} // -> (Y, D) +// .map(e => Triple.create(e._1, RDF.`type`.asNode(), e._2)) // -> (?Y rdf:type ?D ) + .name("rdfp16") // 
println(rdfp15.collect().mkString("\n")) // deduplicate - val triplesNew = env.union(Seq(triplesRDFS7, rdfp3, rdfp8a, rdfp8b, rdfp14b)) - .distinct() -// .subtract(triplesFiltered, parallelism) TODO subtract in Flink??? + import net.sansa_stack.inference.flink.utils.DataSetUtils._ + val instanceTriplesNew = env.union(Seq(triplesRDFS7, rdfp3, rdfp8a, rdfp8b, rdfp14b)) + .distinct(tripleKeyFct) + .subtract(triplesFiltered, tripleKeyFct, tripleKeyFct) - val tripleNewCnt = triplesNew.count + val instanceTriplesNewCnt = instanceTriplesNew.count + logger.debug("new spo triples:" + instanceTriplesNewCnt) - if(iteration == 1 || tripleNewCnt > 0) { + // transitivity rule has to be applied always in the first iteration or when new triples have been inferred + if(iteration == 1 || instanceTriplesNewCnt > 0) { // add triples - triplesFiltered = triplesFiltered.union(triplesNew) + triplesFiltered = triplesFiltered.union(instanceTriplesNew) // rdfp4: (?P rdf:type owl:TransitiveProperty), (?X ?P ?Y), (?Y ?P ?Z) -> (?X ?P ?Z) - val rdfp4 = computeTransitiveClosure(triplesFiltered.filter(triple => transitiveProperties.contains(triple.getPredicate))) + val rdfp4 = computeTransitiveClosure(triplesFiltered.filter(t => transitiveProperties.contains(t.getPredicate))) + .name("rdfp4") // add triples triplesFiltered = triplesFiltered.union(rdfp4) @@ -336,17 +428,18 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule // deduplicate the computed rdf:type triples and check if new triples have been computed val typeTriplesNew = env.union(Seq(triplesRDFS2, triplesRDFS3, triplesRDFS9, rdfp14a, rdfp15, rdfp16)) - .distinct() -// .subtract(typeTriples, parallelism) TODO subtract in Flink??? 
+ .distinct(_.hashCode()) + .subtract(typeTriples, tripleKeyFct, tripleKeyFct) val typeTriplesNewCnt = typeTriplesNew.count + logger.debug("new type triples:" + typeTriplesNewCnt) if(typeTriplesNewCnt > 0) { // add type triples typeTriples = typeTriples.union(typeTriplesNew) } - newDataInferred = typeTriplesNewCnt > 0 || typeTriplesNewCnt > 0 + newDataInferred = instanceTriplesNewCnt > 0 || typeTriplesNewCnt > 0 } // compute the owl:sameAs triples @@ -358,24 +451,40 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule .map(triple => (triple.getSubject, triple.getPredicate) -> triple.getObject) // -> ((?A, ?P), ?B) // println(rdfp1_1.collect().mkString("\n")) // println("Joined:" + rdfp1_1.join(rdfp1_1).collect().mkString("\n")) - // apply self join - val rdfp1 = rdfp1_1 - .join(rdfp1_1).where(0).equalTo(0) ({// ((?A, ?P), ?B) x ((?A, ?P), ?C) - (l, r) => (l._2, r._2) // -> (?B, ?C) - }) - .filter(e => e._1 != e._2) // notEqual(?B ?C) - .map(e => Triple.create(e._1, OWL2.sameAs.asNode(), e._2)) // -> (?B owl:sameAs ?C) + + // apply a) self join or b) groupBy + reduce + val rdfp1 = rdfp1_1.groupBy(nodePairKVKeyFct).reduceGroup(new GroupReduceFunction[((Node, Node), Node), Triple] { + override def reduce(values: Iterable[((Node, Node), Node)], out: Collector[Triple]): Unit = { + val nodes = values.asScala map { e => e._2 } + val pairs = for (x <- nodes; y <- nodes; if x != y) yield (x, y) + pairs.toIterator map {case(n1, n2) => Triple.create(n1, OWL2.sameAs.asNode(), n2)} foreach out.collect + } + }) + .name("rdfp1") +// val rdfp1 = rdfp1_1.join(rdfp1_1).where(nodePairKVKeyFct).equalTo(nodePairKVKeyFct) ({// ((?A, ?P), ?B) x ((?A, ?P), ?C) +// (l, r) => (l._2, r._2) // -> (?B, ?C) +// }) +// .filter(e => e._1 != e._2) // notEqual(?B ?C) +// .map(e => Triple.create(e._1, OWL2.sameAs.asNode(), e._2)) // -> (?B owl:sameAs ?C) // rdfp2: (?P rdf:type owl:InverseFunctionalProperty), (?A ?P ?B), (?C ?P ?B), notEqual(?A ?C) -> (?A owl:sameAs ?C) 
val rdfp2_1 = triplesFiltered .filter(triple => inverseFunctionalProperties.contains(triple.getPredicate)) .map(triple => (triple.getObject, triple.getPredicate) -> triple.getSubject) // -> ((?B, ?P), ?A) - val rdfp2 = rdfp2_1 - .join(rdfp2_1).where(0).equalTo(0) ({// ((?B, ?P), ?A) x ((?B, ?P), ?C) - (l, r) => (l._2, r._2) // -> (?A, ?C) - }) - .filter(e => e._1 != e._2) // notEqual(?A ?C) - .map(e => Triple.create(e._1, OWL2.sameAs.asNode(), e._2)) // -> (?A owl:sameAs ?C) + val rdfp2 = rdfp2_1.groupBy(nodePairKVKeyFct).reduceGroup(new GroupReduceFunction[((Node, Node), Node), Triple] { + override def reduce(values: Iterable[((Node, Node), Node)], out: Collector[Triple]): Unit = { + val nodes = values.asScala map { e => e._2 } + val pairs = for (x <- nodes; y <- nodes; if x != y) yield (x, y) + pairs.toIterator map {case(n1, n2) => Triple.create(n1, OWL2.sameAs.asNode(), n2)} foreach out.collect + } + }) + .name("rdfp2") + +// val rdfp2 = .join(rdfp2_1).where(0).equalTo(0) ({// ((?B, ?P), ?A) x ((?B, ?P), ?C) +// (l, r) => (l._2, r._2) // -> (?A, ?C) +// }) +// .filter(e => e._1 != e._2) // notEqual(?A ?C) +// .map(e => Triple.create(e._1, OWL2.sameAs.asNode(), e._2)) // -> (?A owl:sameAs ?C) triplesFiltered = triplesFiltered.union(rdfp1).union(rdfp2) @@ -391,19 +500,20 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule Seq( triplesFiltered, typeTriples, - subClassOfTriplesTrans, - subPropertyOfTriplesTrans, + subClassOfTriplesTransDS, + subPropertyOfTriplesTransDS, equivClassTriplesInf, equivPropTriplesInf ) ) + .name("inferred triples") // return graph with inferred triples RDFGraph(inferredTriples) } def deduplicate(triples: DataSet[Triple]): DataSet[Triple] = { - triples.distinct() + triples.distinct(tripleKeyFct) } // rdfp15: (?R owl:someValuesFrom ?D), (?R owl:onProperty ?P), (?X ?P ?A), (?A rdf:type ?D ) -> (?X rdf:type ?R ) diff --git 
a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasonerRDFS.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasonerRDFS.scala index e17eeae..aca22a1 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasonerRDFS.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasonerRDFS.scala @@ -57,7 +57,7 @@ class ForwardRuleReasonerRDFS(env: ExecutionEnvironment) extends ForwardRuleReas extractTriples(schemaTriples, RDFS.subClassOf.asNode()) .name("rdfs:subClassOf") // extract rdfs:subClassOf triples val subClassOfTriplesTrans = - computeTransitiveClosureOptSemiNaive(subClassOfTriples).name("rdfs11") + computeTransitiveClosureOptSemiNaive(subClassOfTriples, RDFS.subClassOf.asNode()).name("rdfs11") /* rdfs5 xxx rdfs:subPropertyOf yyy . @@ -67,11 +67,11 @@ class ForwardRuleReasonerRDFS(env: ExecutionEnvironment) extends ForwardRuleReas extractTriples(schemaTriples, RDFS.subPropertyOf.asNode()) .name("rdfs:subPropertyOf") // extract rdfs:subPropertyOf triples val subPropertyOfTriplesTrans = - computeTransitiveClosureOptSemiNaive(subPropertyOfTriples).name("rdfs5") + computeTransitiveClosureOptSemiNaive(subPropertyOfTriples, RDFS.subPropertyOf.asNode()).name("rdfs5") // split by rdf:type val split = triplesDS.partitionBy(t => t.predicateMatches(RDF.`type`.asNode())) - var typeTriples = split._1 + var typeTriples = split._1.name("rdf:type triples") var otherTriples = split._2 // 2. 
SubPropertyOf inheritance according to rdfs7 is computed @@ -212,10 +212,10 @@ class ForwardRuleReasonerRDFS(env: ExecutionEnvironment) extends ForwardRuleReas }.name("rdfs3") // rdfs2 and rdfs3 generated rdf:type triples which we'll add to the existing ones - val triples23 = triplesRDFS2.union(triplesRDFS3) + val triples23 = triplesRDFS2.union(triplesRDFS3).name("inferred rdf:type triples") // all rdf:type triples here as intermediate result - typeTriples = typeTriples.union(triples23) + typeTriples = typeTriples.union(triples23).name("all rdf:type triples") // 4. SubClass inheritance according to rdfs9 diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/TransitiveReasoner.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/TransitiveReasoner.scala index 23498ec..9ce67ec 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/TransitiveReasoner.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/TransitiveReasoner.scala @@ -2,6 +2,7 @@ package net.sansa_stack.inference.flink.forwardchaining import scala.reflect.ClassTag +import org.apache.flink.api.common.functions.RichJoinFunction import org.apache.flink.api.common.typeinfo.TypeInformation import org.apache.flink.api.scala.{DataSet, _} import org.apache.flink.util.Collector @@ -18,6 +19,8 @@ import net.sansa_stack.inference.utils.Profiler */ trait TransitiveReasoner extends Profiler{ + val nodeKeyFct = (n: Node) => n.hashCode() + // def computeTransitiveClosure[A, B, C](s: mutable.Set[(A, B, C)]): mutable.Set[(A, B, C)] = { // val t = addTransitive(s) // // recursive call if set changed, otherwise stop and return @@ -55,35 +58,45 @@ trait TransitiveReasoner extends Profiler{ */ def computeTransitiveClosure(triples: DataSet[Triple]): DataSet[Triple] = { if (triples.count() == 0) return triples - log.info("computing TC...") profile { // keep 
the predicate val predicate = triples.first(1).collect().head.getPredicate + log.info(s"computing TC for property $predicate ...") // compute the TC var subjectObjectPairs = triples.map(t => (t.getSubject, t.getObject)) - // because join() joins on keys, in addition the pairs are stored in reversed order (o, s) - val objectSubjectPairs = subjectObjectPairs.map(t => (t._2, t._1)) - // the join is iterated until a fixed point is reached var i = 1 var oldCount = 0L var nextCount = triples.count() + do { log.info(s"iteration $i...") oldCount = nextCount // perform the join (s1, o1) x (o2, s2), obtaining an DataSet of (s1=o2, (o1, s2)) pairs, // then project the result to obtain the new (s2, o1) paths. +// import org.apache.flink.streaming.api.scala._ +// implicit val typeInfo = TypeInformation.of(classOf[(Node, Node)]) +// val newPairs = subjectObjectPairs +// .join(subjectObjectPairs).where(_._2.hashCode()).equalTo(_._1.hashCode()) {(left, right) => (left.)} +// .map(x => (x._1._1, x._2._2)) +// .filter(tuple => tuple._1 != tuple._2) + implicit val typeInfo = TypeInformation.of(classOf[Int]) subjectObjectPairs = subjectObjectPairs .union( subjectObjectPairs - .join(objectSubjectPairs).where(0).equalTo(0) - .map(x => (x._2._2, x._1._2)) + .join(subjectObjectPairs).where(_._2.hashCode()).equalTo(_._1.hashCode())(typeInfo) + (new RichJoinFunction[(Node, Node), (Node, Node), (Node, Node)] { + + override def join(left: (Node, Node), right: (Node, Node)): (Node, Node) = (left._1, right._2) + }) + .withForwardedFieldsFirst("_1").withForwardedFieldsSecond("_2") +// .map(x => (x._1._1, x._2._2)) .filter(tuple => tuple._1 != tuple._2)// omit (s1, s1) ) - .distinct() + .distinct(pair => pair._1.hashCode() * 17 + pair._2.hashCode() * 31) nextCount = subjectObjectPairs.count() i += 1 } while (nextCount != oldCount) @@ -181,19 +194,31 @@ trait TransitiveReasoner extends Profiler{ /** * Computes the transitive closure on a DataSet of triples. 
* Note, that the assumption is that all triples do have the same predicate. + * If no predicate is given, we take any triple from the dataset and use its predicate. We highly recommend to + * provide the predicate in order to avoid unnecessary operations. + * * This implementation uses the Flink iterate operator (see * [[https://ci.apache.org/projects/flink/flink-docs-master/dev/batch/iterations.html"]]) * - * @param triples the DataSet of triples - * @return a DataSet containing the transitive closure of the triples + * @param triples the [[DataSet]] of triples + * @param predicate the optional predicate + * @return a [[DataSet]] containing the transitive closure of the triples */ - def computeTransitiveClosureOptSemiNaive(triples: DataSet[Triple]): DataSet[Triple] = { + def computeTransitiveClosureOptSemiNaive(triples: DataSet[Triple], predicate: Node = null): DataSet[Triple] = { + + // if no predicate is given, we take an arbitrary triple + // this also means, we return here if the dataset is empty (couldn't find a isEmpty() function) + val pred = if (predicate != null) predicate else { + val t = triples.first(1).collect() + if(t.nonEmpty) t.head.getPredicate + else return triples + } + // apparently, we have to use pairs for (subject, object) because the Jena Triple is not a Scala tuple // and we have to provide positions of key and value in the iterate method // the initial set of edges is used as input for both, the workset and the solutionset val initialTC = triples.map(t => (NodeKey(t.getSubject), NodeKey(t.getObject))) - val pred = triples.first(1).collect().head.getPredicate log.info("computing TC...") def iterate(s: DataSet[(NodeKey, NodeKey)], ws: DataSet[(NodeKey, NodeKey)]) From 634099a30b207461a6cb267b05edceb1169461c7 Mon Sep 17 00:00:00 2001 From: Lorenz Buehmann Date: Fri, 28 Jun 2019 09:30:11 +0200 Subject: [PATCH 21/23] Added missing deps for dist package --- sansa-inference-flink/pom.xml | 10 +++++----- 1 file changed, 5 insertions(+), 5 
deletions(-) diff --git a/sansa-inference-flink/pom.xml b/sansa-inference-flink/pom.xml index 701dbca..f947952 100644 --- a/sansa-inference-flink/pom.xml +++ b/sansa-inference-flink/pom.xml @@ -524,12 +524,12 @@ under the License. org.apache.commons:commons-math org.apache.sling:org.apache.sling.commons.json commons-logging:commons-logging - org.apache.httpcomponents:httpclient - org.apache.httpcomponents:httpcore + + commons-codec:commons-codec - com.fasterxml.jackson.core:jackson-core - com.fasterxml.jackson.core:jackson-databind - com.fasterxml.jackson.core:jackson-annotations + + + org.codehaus.jettison:jettison stax:stax-api com.typesafe:config From 2ddb2f91f189c62dddace9cc893e86cd11ca8c26 Mon Sep 17 00:00:00 2001 From: Lorenz Buehmann Date: Fri, 28 Jun 2019 09:56:50 +0200 Subject: [PATCH 22/23] Minor --- .../inference/flink/RDFGraphMaterializer.scala | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/RDFGraphMaterializer.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/RDFGraphMaterializer.scala index da699eb..2c12890 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/RDFGraphMaterializer.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/RDFGraphMaterializer.scala @@ -93,15 +93,20 @@ object RDFGraphMaterializer { val inferredGraph = reasoner.apply(graph) // println(s"|G_inf| = ${inferredGraph.size}") +// println(env.getExecutionPlan()) + // write triples to disk RDFGraphWriter.writeToDisk(inferredGraph, output, writeToSingleFile, sortedOutput) - // println(env.getExecutionPlan()) +// println(env.getExecutionPlan()) - val jn = if (jobName.isEmpty) s"${profile} Reasoning" else jobName + val jn = if (jobName.isEmpty) s"$profile Reasoning" else jobName // run the program env.execute(jn) + + + } // the config object @@ -120,7 +125,7 @@ object RDFGraphMaterializer { // the CLI 
parser val parser = new scopt.OptionParser[Config]("RDFGraphMaterializer") { - head("RDFGraphMaterializer", "0.5.0") + head("RDFGraphMaterializer", "0.6.0") // opt[Seq[File]]('i', "input").required().valueName(",,..."). // action((x, c) => c.copy(in = x)). From b5e03b233e985d508ceeaf9af6d936ad44098f8b Mon Sep 17 00:00:00 2001 From: Lorenz Buehmann Date: Fri, 28 Jun 2019 10:22:39 +0200 Subject: [PATCH 23/23] Scalastyle cleanup --- .../inference/flink/data/RDFGraphLoader.scala | 2 +- .../forwardchaining/ForwardRuleReasoner.scala | 20 ------------------- 2 files changed, 1 insertion(+), 21 deletions(-) diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraphLoader.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraphLoader.scala index 7e753fd..afbf0bd 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraphLoader.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraphLoader.scala @@ -39,7 +39,7 @@ object RDFGraphLoader { val ds = RDFGraphLoader.loadFromDisk(path, env).triples println(s"size:${ds.count}") - println("sample data:\n" + ds.first(10).map { _.toString.replaceAll("[\\x00-\\x1f]","???")}.collect().mkString("\n")) + println("sample data:\n" + ds.first(10).map { _.toString.replaceAll("[\\x00-\\x1f]", "???")}.collect().mkString("\n")) } } diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasoner.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasoner.scala index 50833fc..0104aef 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasoner.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasoner.scala @@ -68,26 +68,6 @@ trait ForwardRuleReasoner extends TransitiveReasoner{ subject: 
Option[Node], predicate: Option[Node], obj: Option[Node]): DataSet[Triple] = { -// import net.sansa_stack.inference.utils.PredicateUtils._ -// var extractedTriples = triples -// var filter = (t: Triple) => true -// -// if(subject.isDefined) { -// filter = filter || (_.subjectMatches(subject.get)) -//// extractedTriples = extractedTriples.filter(triple => triple.subjectMatches(subject.get)) -// } -// -// if(predicate.isDefined) { -// filter = filter || (_.predicateMatches(predicate.get)) -//// extractedTriples = extractedTriples.filter(triple => triple.predicateMatches(predicate.get)) -// } -// -// if(obj.isDefined) { -// filter = filter || (_.objectMatches(obj.get)) -//// extractedTriples = extractedTriples.filter(triple => triple.objectMatches(obj.get)) -// } -// -// extractedTriples.filter(filter) val filterFct = (t: Triple) => t.subjectMatches(subject.orNull) ||