Skip to content

Commit

Permalink
[NOID] Fixes #4138: Add support for loading Gephi GEXF file format (#…
Browse files Browse the repository at this point in the history
…4171)

* Fixes #4138: Add support for loading Gephi GEXF file format

* removed unused imports

* Fixed RollupTest
  • Loading branch information
vga91 committed Dec 19, 2024
1 parent cd0a12e commit ad13cc7
Show file tree
Hide file tree
Showing 14 changed files with 953 additions and 180 deletions.
52 changes: 41 additions & 11 deletions core/src/main/java/apoc/export/graphml/XmlGraphMLReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
*/
package apoc.export.graphml;


import apoc.export.util.BatchTransaction;
import apoc.export.util.ExportConfig;
import apoc.export.util.Reporter;
Expand Down Expand Up @@ -218,13 +219,37 @@ public Object parseValue(String input) {
// Attribute names passed to the Key constructor when a "key" element is read.
public static final QName TYPE = QName.valueOf("attr.type");
public static final QName LIST = QName.valueOf("attr.list");
// Attribute name "key".
public static final QName KEY = QName.valueOf("key");
// Attribute holding the property value; read directly from the element when the reader type is not GRAPHML.
public static final QName VALUE = QName.valueOf("value");
// Attribute of "attribute" elements holding the declared data type, stored per attribute id.
public static final QName DATA_TYPE = QName.valueOf("type");
// Attribute name "kind", used as the edge label attribute by the GEXF reader type.
public static final QName KIND = QName.valueOf("kind");

/**
 * Creates a reader that keeps the given database service and transaction in its
 * {@code db} and {@code tx} fields.
 *
 * @param db database service assigned to the {@code db} field
 * @param tx transaction assigned to the {@code tx} field
 */
public XmlGraphMLReader(GraphDatabaseService db, Transaction tx) {
this.db = db;
this.tx = tx;
}

/**
 * XML dialects this reader can parse, together with the element and attribute names
 * that differ between them.
 *
 * <p>The parser compares each start element's local name with {@link #attvalue} to detect a
 * property element, reads the property id from the {@link #key} attribute, and reads the
 * edge label from the {@link #label} attribute.
 */
public enum ReaderType {
    /** GraphML: properties are {@code <data>} elements identified by the {@code KEY} attribute. */
    GRAPHML("data", KEY, LABEL),
    /** GEXF: properties are {@code <attvalue>} elements identified by the {@code FOR} attribute. */
    GEXF("attvalue", FOR, KIND);

    /** Local name of the element that carries a property value. */
    public final String attvalue;
    /** Attribute of the property element that holds the property id. */
    public final QName key;
    /** Attribute of an {@code edge} element that holds its label. */
    public final QName label;

    ReaderType(String attvalue, QName key, QName label) {
        this.attvalue = attvalue;
        this.key = key;
        this.label = label;
    }
}

/**
 * Convenience overload that delegates to
 * {@link #parseXML(Reader, TerminationGuard, ReaderType)} with {@link ReaderType#GRAPHML}.
 *
 * @param input character stream containing the XML document
 * @param terminationGuard guard forwarded unchanged to the three-argument overload
 * @return the value returned by the three-argument overload
 * @throws XMLStreamException propagated from the three-argument overload
 */
public long parseXML(Reader input, TerminationGuard terminationGuard) throws XMLStreamException {
return parseXML(input, terminationGuard, ReaderType.GRAPHML);
}

public long parseXML(Reader input, TerminationGuard terminationGuard, ReaderType readerType)
throws XMLStreamException {
Map<String, Object> dataMap = new HashMap<>();
Map<String, Long> cache = new HashMap<>(1024 * 32);
XMLInputFactory inputFactory = XMLInputFactory.newInstance();
inputFactory.setProperty("javax.xml.stream.isCoalescing", true);
Expand All @@ -238,7 +263,6 @@ public long parseXML(Reader input, TerminationGuard terminationGuard) throws XML
int count = 0;
BatchTransaction tx = new BatchTransaction(db, batchSize * 10, reporter);
try {

while (reader.hasNext()) {
terminationGuard.check();
XMLEvent event;
Expand All @@ -257,11 +281,14 @@ public long parseXML(Reader input, TerminationGuard terminationGuard) throws XML
continue;
}
if (event.isStartElement()) {

StartElement element = event.asStartElement();
String name = element.getName().getLocalPart();

if (name.equals("graphml") || name.equals("graph")) continue;
if (name.equals("graphml") || name.equals("graph") || name.equals("gexf")) continue;
if (name.equals("attribute")) {
String id = getAttribute(element, ID);
String type = getAttribute(element, DATA_TYPE);
dataMap.put(id, type);
}
if (name.equals("key")) {
String id = getAttribute(element, ID);
Key key = new Key(
Expand All @@ -270,7 +297,6 @@ public long parseXML(Reader input, TerminationGuard terminationGuard) throws XML
getAttribute(element, TYPE),
getAttribute(element, LIST),
getAttribute(element, FOR));

XMLEvent next = peek(reader);
if (next.isStartElement()
&& next.asStartElement()
Expand All @@ -284,20 +310,23 @@ public long parseXML(Reader input, TerminationGuard terminationGuard) throws XML
else relKeys.put(id, key);
continue;
}
if (name.equals("data")) {
if (name.equals(readerType.attvalue)) { // Changed from data to attvalue for node properties in gexf
if (last == null) continue;
String id = getAttribute(element, KEY);
String id = getAttribute(element, readerType.key);
boolean isNode = last instanceof Node;
Key key = isNode ? nodeKeys.get(id) : relKeys.get(id);
if (key == null) key = Key.defaultKey(id, isNode);
final Map.Entry<XMLEvent, Object> eventEntry = getDataEventEntry(reader, key);
final XMLEvent next = eventEntry.getKey();
final Object value = eventEntry.getValue();
final Object value = readerType.equals(ReaderType.GRAPHML)
? eventEntry.getValue()
: getAttribute(element, VALUE);
if (value != null) {
if (this.labels && isNode && id.equals("labels")) {
addLabels((Node) last, value.toString());
} else if (!this.labels || isNode || !id.equals("label")) {
last.setProperty(key.name, value);
Object convertedValue = toValidValue(value, key.name, dataMap);
last.setProperty(key.name, convertedValue);
if (reporter != null) reporter.update(0, 0, 1);
}
} else if (next.getEventType() == XMLStreamConstants.END_ELEMENT) {
Expand All @@ -311,7 +340,8 @@ public long parseXML(Reader input, TerminationGuard terminationGuard) throws XML
String id = getAttribute(element, ID);
Node node = tx.getTransaction().createNode();
if (this.labels) {
String labels = getAttribute(element, LABELS);
String labels = getAttribute(
element, LABEL); // Changed from labels to label to fit gexf property format
addLabels(node, labels);
}
if (storeNodeIds) node.setProperty("id", id);
Expand All @@ -324,7 +354,7 @@ public long parseXML(Reader input, TerminationGuard terminationGuard) throws XML
}
if (name.equals("edge")) {
tx.increment();
String label = getAttribute(element, LABEL);
String label = getAttribute(element, readerType.label); // changed from label to kind for gexf
Node from = getByNodeId(cache, tx.getTransaction(), element, XmlNodeExport.NodeType.SOURCE);
Node to = getByNodeId(cache, tx.getTransaction(), element, XmlNodeExport.NodeType.TARGET);

Expand Down
4 changes: 2 additions & 2 deletions core/src/main/java/apoc/load/Xml.java
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ public Map<String, Object> parse(
.orElse(null);
}

private Stream<MapResult> xmlXpathToMapResult(
public static Stream<MapResult> xmlXpathToMapResult(
@Name("urlOrBinary") Object urlOrBinary, boolean simpleMode, String path, Map<String, Object> config)
throws Exception {
if (config == null) config = Collections.emptyMap();
Expand All @@ -150,7 +150,7 @@ private Stream<MapResult> xmlXpathToMapResult(
}
}

private Stream<MapResult> parse(InputStream data, boolean simpleMode, String path, boolean failOnError)
public static Stream<MapResult> parse(InputStream data, boolean simpleMode, String path, boolean failOnError)
throws Exception {
List<MapResult> result = new ArrayList<>();
try {
Expand Down
1 change: 1 addition & 0 deletions docs/asciidoc/modules/ROOT/nav.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ include::partial$generated-documentation/nav.adoc[]
** xref::import/load-html.adoc[]
** xref::import/import-csv.adoc[]
** xref::import/import-graphml.adoc[]
** xref::import/gexf.adoc[]
* xref:export/index.adoc[]
** xref::export/web-apis.adoc[]
Expand Down
222 changes: 222 additions & 0 deletions docs/asciidoc/modules/ROOT/pages/import/gexf.adoc
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
[[gexf]]
= Load GEXF (Graph Exchange XML Format)
:description: This section describes procedures that can be used to import data from GEXF files.



Many existing applications and data integrations use GEXF to describe a graph with nodes and edges.
For further information, you should visit the https://gexf.net/[official documentation].

It is possible to load or import nodes and relationships from a GEXF file with the procedures
`apoc.load.gexf` and `apoc.import.gexf`. You need to:

* provide a path to a GEXF file
* provide configuration (optional)

The `apoc.import.gexf` procedure reads the file in the same way as `apoc.load.gexf`, but also creates nodes and relationships in Neo4j.

For reading from files you'll have to enable the config option:

----
apoc.import.file.enabled=true
----

By default, file paths are global. For paths relative to the `import` directory, set:

----
apoc.import.file.use_neo4j_config=true
----

== Examples for apoc.load.gexf

.load.gexf
----
<?xml version="1.0" encoding="UTF-8"?>
<gexf version="1.2">
<graph defaultedgetype="directed">
<nodes>
<node foo="bar">
<attvalues>
<attvalue for="0" value="http://gephi.org"/>
</attvalues>
</node>
</nodes>
</graph>
</gexf>
----

[source, cypher]
----
CALL apoc.load.gexf('load.gexf')
----

.Results
[opts="header"]
|===
| value
| {_type: gexf, _children: [{_type: graph, defaultedgetype: directed, _children: [{_type: nodes, _children: [{_type: node, _children: [{_type: attvalues, _children: [{_type: attvalue, for: 0, value: http://gephi.org}]}], foo: bar}]}]}], version: 1.2}
|===

== Examples for apoc.import.gexf

Besides the file you can pass in a config map:

.Config parameters
[opts=header]
|===
| name | type | default | description
| readLabels | Boolean | false | Creates node labels based on the value in the `label` attribute of `node` elements
| defaultRelationshipType | String | RELATED | The default relationship type to use if none is specified in the GEXF file
| storeNodeIds | Boolean | false | store the `id` property of `node` elements
| batchSize | Integer | 20000 | The number of elements to process per transaction
| compression | `Enum[NONE, BYTES, GZIP, BZIP2, DEFLATE, BLOCK_LZ4, FRAMED_SNAPPY]` | `null` | Allow taking binary data, either not compressed (value: `NONE`) or compressed (other values).
See the xref::overview/apoc.load/apoc.load.csv.adoc#_binary_file[Binary file example]
| source | Map<String,String> | Empty map | See `source / target config` parameter below
| target | Map<String,String> | Empty map | See `source / target config` parameter below
|===


Importing the following file creates:

* 2 nodes with label Gephi
* 2 nodes with label Webatlas
* 1 node with label RTGI
* 1 node with label BarabasiLab
* 6 relationships of kind KNOWS
* 1 relationship of kind HAS_TICKET
* 1 relationship of kind BAZ

.data.gexf
----
<?xml version="1.0" encoding="UTF-8"?>
<gexf xmlns="http://gexf.net/1.3" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://gexf.net/1.3 http://gexf.net/1.3/gexf.xsd" version="1.2">
<meta lastmodifieddate="2009-03-20">
<creator>Gephi.org</creator>
<description>A Web network</description>
</meta>
<graph defaultedgetype="directed">
<attributes class="node">
<attribute id="0" title="url" type="string"/>
<attribute id="room" title="room" type="integer"/>
<attribute id="projects" title="projects" type="long"/>
<attribute id="price" title="price" type="double"/>
<attribute id="1" title="indegree" type="float"/>
<attribute id="members" title="members" type="liststring"/>
<attribute id="pins" title="pins" type="listboolean"/>
<attribute id="2" title="frog" type="boolean">
<default>true</default>
</attribute>
</attributes>
<attributes class="edge">
<attribute id="score" title="score" type="float"/>
</attributes>
<nodes>
<node id="0" label="Gephi">
<attvalues>
<attvalue for="0" value="http://gephi.org"/>
<attvalue for="1" value="1"/>
<attvalue for="room" value="10"/>
<attvalue for="price" value="10.02"/>
<attvalue for="projects" value="300"/>
<attvalue for="members" value="[Altomare, Sterpeto, Lino]"/>
<attvalue for="pins" value="[true, false, true, false]"/>
</attvalues>
</node>
<node id="5" label="Gephi">
<attvalues>
<attvalue for="0" value="http://test.gephi.org"/>
<attvalue for="1" value="2"/>
</attvalues>
</node>
<node id="1" label="Webatlas">
<attvalues>
<attvalue for="0" value="http://webatlas.fr"/>
<attvalue for="1" value="2"/>
</attvalues>
</node>
<node id="2" label="RTGI">
<attvalues>
<attvalue for="0" value="http://rtgi.fr"/>
<attvalue for="1" value="1"/>
</attvalues>
</node>
<node id="3" label=":BarabasiLab:Webatlas">
<attvalues>
<attvalue for="0" value="http://barabasilab.com"/>
<attvalue for="1" value="1"/>
<attvalue for="2" value="false"/>
</attvalues>
</node>
</nodes>
<edges>
<edge source="0" target="1" kind="KNOWS">
<attvalues>
<attvalue for="score" value="1.5"/>
</attvalues>
</edge>
<edge source="0" target="0" kind="BAZ">
<attvalues>
<attvalue for="foo" value="bar"/>
<attvalue for="score" value="2"/>
</attvalues>
</edge>
<edge source="0" target="2" kind="HAS_TICKET">
<attvalues>
<attvalue for="ajeje" value="brazorf"/>
<attvalue for="score" value="3"/>
</attvalues>
</edge>
<edge source="0" target="2" kind="KNOWS" />
<edge source="1" target="0" kind="KNOWS" />
<edge source="2" target="1" kind="KNOWS" />
<edge source="0" target="3" kind="KNOWS" />
<edge source="5" target="3" kind="KNOWS" />
</edges>
</graph>
</gexf>
----

[source, cypher]
----
CALL apoc.import.gexf('data.gexf', {readLabels:true})
----

.Results
[opts="header"]
|===
| value
| {
"relationships" : 8,
"batches" : 0,
"file" : "file:/../data.gexf",
"nodes" : 5,
"format" : "gexf",
"source" : "file",
"time" : 9736,
"rows" : 0,
"batchSize" : -1,
"done" : true,
"properties" : 21
}
|===

We can also store the node IDs by executing:
[source, cypher]
----
CALL apoc.import.gexf('data.gexf', {readLabels:true, storeNodeIds: true})
----

=== source / target config

Allows the import of relationships in case the source and/or target nodes are not present in the file, searching for nodes via a custom label and property.
To do this, we can insert into the config map `source: {label: '<MY_SOURCE_LABEL>', id: '<MY_SOURCE_ID>'}` and/or `target: {label: '<MY_TARGET_LABEL>', id: '<MY_TARGET_ID>'}`.
In this way, we can search for start and end nodes via the `source` and `target` attributes of the `edge` tag.

For example, with a config map `{source: {id: 'myId', label: 'Foo'}, target: {id: 'other', label: 'Bar'}}`
and an edge row like `<edge id="e0" source="n0" target="n1" kind="KNOWS"/>`,
we search for a source node `(:Foo {myId: 'n0'})` and an end node `(:Bar {other: 'n1'})`.
The id key is optional (the default is `'id'`).




3 changes: 2 additions & 1 deletion docs/asciidoc/modules/ROOT/pages/import/index.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,5 @@ For more information on these procedures, see:
* xref::import/load-xml.adoc[]
* xref::import/load-html.adoc[]
* xref::import/import-csv.adoc[]
* xref::import/import-graphml.adoc[]
* xref::import/import-graphml.adoc[]
* xref::import/gexf.adoc[]
Loading

0 comments on commit ad13cc7

Please sign in to comment.