Goal: I am trying to make a Neo4j instance of the DBLP database on the basis of the publicly available DBLP XML file available here. I have modeled the database as a bipartite graph where the authors are in one set and the publications in the other set. To obtain all coauthors of John Doe one has to make the following Cypher query:
MATCH (a:Author)-[:WROTE]->(publication)<-[:WROTE]-(b:Author) WHERE a.name = "John Doe" RETURN DISTINCT b"
Problem 1: There seems to be a problem partly related to special characters, such as ë, æ, í, etc. When I, in my browser at the address http://localhost:7474/browser/, enter the query "MATCH (a:Author)-[:WROTE]->(p)<-[:WROTE]-(b:Author) WHERE a.name = "Jan Arne Telle" RETURN DISTINCT b", I should get 58 unique results (coauthors), but I get 79 results. For instance, coauthor Daniël Paulusma is split into three results: "Dani", "ë", "l Paulusma". But in fact, I also get coauthor David Keldsen as three results: "David Keldsen", "David", and "Keldsen". So the problem is not only related to special characters.
Problem 2: Results for the above mentioned query were returned in 90697 ms.
EDIT: After making several such queries results are returned in 2000 ms to 4000 ms.
Here is all the code:
Entry point: Application.java:
package std;
import java.io.File;
import org.neo4j.graphdb.GraphDatabaseService;
import org.neo4j.graphdb.Transaction;
import org.neo4j.graphdb.factory.GraphDatabaseFactory;
import org.neo4j.kernel.impl.util.FileUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.CommandLineRunner;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.data.neo4j.config.EnableNeo4jRepositories;
import org.springframework.data.neo4j.config.Neo4jConfiguration;
import org.springframework.data.neo4j.core.GraphDatabase;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.SAXException;
import org.apache.xerces.util.SecurityManager;
@SpringBootApplication
public class Application implements CommandLineRunner {
@Configuration
@EnableNeo4jRepositories(basePackages = "std")
static class ApplicationConfig extends Neo4jConfiguration {
public ApplicationConfig() {
setBasePackage("std");
}
@Bean
GraphDatabaseService graphDatabaseService() {
return new GraphDatabaseFactory().newEmbeddedDatabase("dblp.db");
}
}
@Autowired
PublicationRepository publicationRepository;
@Autowired
GraphDatabase graphDatabase;
public void run(String... args) throws Exception {
Transaction tx = graphDatabase.beginTx();
try {
SAXParserFactory parserFactory = SAXParserFactory.newInstance();
SAXParser parser = parserFactory.newSAXParser();
SecurityManager mgr = new SecurityManager();
mgr.setEntityExpansionLimit(3100000);
parser.setProperty("http://apache.org/xml/properties/security-manager", mgr);
SaxHandler handler = new SaxHandler(publicationRepository, graphDatabase);
handler.setTransaction(tx);
parser.getXMLReader().setFeature("http://xml.org/sax/features/validation", true);
InputStream xmlInput = new FileInputStream("/Users/username/Documents/dblp.xml");
parser.parse(xmlInput, handler);
tx.success();
} catch (SAXException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} catch (ParserConfigurationException e) {
e.printStackTrace();
} finally {
tx.close();
}
}
public static void main(String[] args) throws Exception {
FileUtils.deleteRecursively(new File("dblp.db"));
SpringApplication.run(Application.class, args);
}
}
Author.java:
package std;
import org.springframework.data.neo4j.annotation.GraphId;
import org.springframework.data.neo4j.annotation.Indexed;
import org.springframework.data.neo4j.annotation.NodeEntity;
import org.springframework.data.neo4j.annotation.Query;
import org.springframework.data.neo4j.support.index.IndexType;
@NodeEntity
public class Author {
@GraphId
private Long id;
@Indexed(indexName = "names", unique = true, indexType = IndexType.FULLTEXT)
private String name;
public Author() {
}
public Author(String name) {
}
@Override
public boolean equals(Object obj) {
if (this == obj)
return true;
if (obj == null)
return false;
if (this.getClass() != obj.getClass())
return false;
Author other = (Author) obj;
if (this.id != null && this.name != null && other.id != null && other.name != null) {
if (this.id.equals(other.id) && this.name.equals(other.name))
return true;
} else {
return true;
}
return false;
}
@Override
public int hashCode() {
return 31 * (this.id == null ? 1 : this.id.hashCode()) + 31 * (this.name == null ? 1 : this.name.hashCode());
}
public Long getId() {
return id;
}
public void setId(Long id) {
this.id = id;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
}
Publication.java:
package std;
import java.io.Serializable;
import java.util.HashSet;
import java.util.Set;
import org.neo4j.graphdb.Direction;
import org.springframework.data.neo4j.annotation.GraphId;
import org.springframework.data.neo4j.annotation.Indexed;
import org.springframework.data.neo4j.annotation.NodeEntity;
import org.springframework.data.neo4j.annotation.RelatedTo;
import org.springframework.data.neo4j.support.index.IndexType;
@NodeEntity
public class Publication implements Serializable {
private static final long serialVersionUID = -6393545300391560520L;
@GraphId
Long nodeId;
private String type = "";
private String key = "";
private String mdate = "";
private String publtype = "";
private String reviewid = "";
private String rating = "";
@RelatedTo(type = "WROTE", direction = Direction.INCOMING)
private Set<Author> authors = new HashSet<Author>();
private String editor = "";
@Indexed(indexType = IndexType.FULLTEXT, indexName = "titles")
private String title = "";
private String booktitle = "";
private String pages = "";
private String year = "";
private String address = "";
private String journal = "";
private String volume = "";
private String number = "";
private String month = "";
private String url = "";
private String ee = "";
private String cdrom = "";
private String cite = "";
private String publisher = "";
private String note = "";
private String crossref = "";
private String isbn = "";
private String series = "";
private String school = "";
private String chapter = "";
public Publication() {
}
public void addAuthor(Author author) {
authors.add(author);
}
public Set<Author> getAuthors() {
return authors;
}
public void setAuthors(Set<Author> authors) {
this.authors = authors;
}
@Override
public String toString() {
return "TYPE: " + type + "\n"
+ "KEY: " + key + "\n"
+ "MDATE: " + mdate + "\n";
}
public Long getNodeId() {
return nodeId;
}
public void setNodeId(Long nodeId) {
this.nodeId = nodeId;
}
public String getKey() {
return key;
}
public void setKey(String key) {
this.key = key;
}
public String getMdate() {
return mdate;
}
public void setMdate(String mdate) {
this.mdate = mdate;
}
public String getPubltype() {
return publtype;
}
public void setPubltype(String publtype) {
this.publtype = publtype;
}
public String getReviewid() {
return reviewid;
}
public void setReviewid(String reviewid) {
this.reviewid = reviewid;
}
public String getRating() {
return rating;
}
public void setRating(String rating) {
this.rating = rating;
}
public String getType() {
return type;
}
public void setType(String type) {
this.type = type;
}
public String getEditor() {
return editor;
}
public void setEditor(String editor) {
this.editor = editor;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getBooktitle() {
return booktitle;
}
public void setBooktitle(String booktitle) {
this.booktitle = booktitle;
}
public String getPages() {
return pages;
}
public void setPages(String pages) {
this.pages = pages;
}
public String getYear() {
return year;
}
public void setYear(String year) {
this.year = year;
}
public String getAddress() {
return address;
}
public void setAddress(String address) {
this.address = address;
}
public String getJournal() {
return journal;
}
public void setJournal(String journal) {
this.journal = journal;
}
public String getVolume() {
return volume;
}
public void setVolume(String volume) {
this.volume = volume;
}
public String getNumber() {
return number;
}
public void setNumber(String number) {
this.number = number;
}
public String getMonth() {
return month;
}
public void setMonth(String month) {
this.month = month;
}
public String getUrl() {
return url;
}
public void setUrl(String url) {
this.url = url;
}
public String getEe() {
return ee;
}
public void setEe(String ee) {
this.ee = ee;
}
public String getCdrom() {
return cdrom;
}
public void setCdrom(String cdrom) {
this.cdrom = cdrom;
}
public String getCite() {
return cite;
}
public void setCite(String cite) {
this.cite = cite;
}
public String getPublisher() {
return publisher;
}
public void setPublisher(String publisher) {
this.publisher = publisher;
}
public String getNote() {
return note;
}
public void setNote(String note) {
this.note = note;
}
public String getCrossref() {
return crossref;
}
public void setCrossref(String crossref) {
this.crossref = crossref;
}
public String getIsbn() {
return isbn;
}
public void setIsbn(String isbn) {
this.isbn = isbn;
}
public String getSeries() {
return series;
}
public void setSeries(String series) {
this.series = series;
}
public String getSchool() {
return school;
}
public void setSchool(String school) {
this.school = school;
}
public String getChapter() {
return chapter;
}
public void setChapter(String chapter) {
this.chapter = chapter;
}
}
PublicationRepository.java:
package std;
import org.springframework.data.neo4j.repository.GraphRepository;
public interface PublicationRepository extends GraphRepository<Publication> {
Publication findByTitle(String title);
}
SaxHandler.java:
package std;
import java.util.ArrayList;
import java.util.List;
import java.util.Stack;
import org.neo4j.graphdb.Transaction;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.neo4j.core.GraphDatabase;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
public class SaxHandler extends DefaultHandler {
private Stack<String> qNameStack = new Stack<String>();
private Stack<Publication> publicationStack = new Stack<Publication>();
private String publicationType = null;
private PublicationRepository publicationRepository = null;
private Publication publication = null;
private Author author = null;
private String currentElement = null;
private String value = null;
private boolean insideTitle = false;
private GraphDatabase graphDatabase;
private Transaction tx = null;
private static int counter = 0;
public List<Publication> getPublications() {
return publications;
}
@Autowired
public SaxHandler(PublicationRepository publicationRepository, GraphDatabase graphDatabase) {
this.publicationRepository = publicationRepository;
this.graphDatabase = graphDatabase;
}
public void setTransaction(Transaction tx) {
this.tx = tx;
}
public void startElement(String uri, String localName, String tagName, Attributes attributes) throws SAXException {
storeTagName(tagName);
createEmptyPublication();
testIfEnteringTitle(tagName);
testIfPublicationTag(tagName);
testOnAttributes(tagName, attributes);
}
public void endElement(String uri, String localName, String tagName) throws SAXException {
testIfLeavingTitle(tagName);
removeNameOfLastVisitedTag();
testIfFinishedCreatingPublication(tagName);
}
public void characters(char ch[], int start, int length) throws SAXException {
storeContentsInCurrentPublication(ch, start, length);
}
/**
* Store the contents of the current tag in the corresponding field
* of the current publication.
*
* @param ch
* @param start
* @param length
*/
private void storeContentsInCurrentPublication(char ch[], int start, int length) {
value = new String(ch,start,length).trim();
if (value.length() == 0)
return;
publication = publicationStack.peek();
currentElement = qNameStack.peek();
if ("author".equals(currentElement)) {
author = new Author();
author.setName(value);
publication.addAuthor(author);
} else if ("editor".equals(currentElement)) {
publication.setEditor(value);
} else if ("title".equals(currentElement)) {
String title = publication.getTitle() + value;
publication.setTitle(title);
} else if ("booktitle".equals(currentElement)) {
publication.setBooktitle(value);
} else if ("pages".equals(currentElement)) {
publication.setPages(value);
} else if ("year".equals(currentElement)) {
publication.setYear(value);
} else if ("address".equals(currentElement)) {
publication.setAddress(value);
} else if ("journal".equals(currentElement)) {
publication.setJournal(value);
} else if ("volume".equals(currentElement)) {
publication.setVolume(value);
} else if ("number".equals(currentElement)) {
publication.setNumber(value);
} else if ("month".equals(currentElement)) {
publication.setMonth(value);
} else if ("url".equals(currentElement)) {
publication.setUrl(value);
} else if ("ee".equals(currentElement)) {
publication.setEe(value);
} else if ("cdrom".equals(currentElement)) {
publication.setCdrom(value);
} else if ("cite".equals(currentElement)) {
publication.setCite(value);
} else if ("publisher".equals(currentElement)) {
publication.setPublisher(value);
} else if ("note".equals(currentElement)) {
publication.setNote(value);
} else if ("crossref".equals(currentElement)) {
publication.setCrossref(value);
} else if ("isbn".equals(currentElement)) {
publication.setIsbn(value);
} else if ("series".equals(currentElement)) {
publication.setSeries(value);
} else if ("school".equals(currentElement)) {
publication.setSchool(value);
} else if ("chapter".equals(currentElement)) {
publication.setChapter(value);
} else if ("i".equals(currentElement) && isInsideTitleOrBooktitle()) {
String title = publication.getTitle() + "<i>" + value + "</i>";
publication.setTitle(title);
} else if ("sup".equals(currentElement) && isInsideTitleOrBooktitle()) {
String title = publication.getTitle() + "<sup>" + value + "</sup>";
publication.setTitle(title);
} else if ("sub".equals(currentElement) && isInsideTitleOrBooktitle()) {
String title = publication.getTitle() + "<sub>" + value + "</sub>";
publication.setTitle(title);
} else if ("tt".equals(currentElement) && isInsideTitleOrBooktitle()) {
String title = publication.getTitle() + "<tt>" + value + "</tt>";
publication.setTitle(title);
} else if ("ref".equals(currentElement) && isInsideTitleOrBooktitle()) {
String title = publication.getTitle() + "<ref>" + value + "</ref>";
publication.setTitle(title);
}
}
/**
* Returns true if and only if the parser is inside
* either a title or booktitle tag.
*
* @return true if and only if the parser is inside
* either a title or booktitle tag.
*/
private boolean isInsideTitleOrBooktitle() {
return insideTitle;
}
/**
* Checks if the parser is finished with one whole
* publication. If so, the publication is stored in
* the database.
*
* @param tagName
*/
private void testIfFinishedCreatingPublication(String tagName) {
if (publicationType.equals(tagName)) {
publicationRepository.save(publicationStack.pop());
if (++counter % 1000 == 0) {
System.out.println("Counter = " + counter);
tx.success();
tx.close();
tx = graphDatabase.beginTx();
}
}
}
/**
* Removes the tag name of the last visited tag
* from the stack.
*/
private void removeNameOfLastVisitedTag() {
qNameStack.pop();
}
/**
* Store the tag name on the stack.
*
* @param tagName
*/
private void storeTagName(String tagName) {
qNameStack.push(tagName);
}
/**
* Create an empty publication to be filled with data.
*/
private void createEmptyPublication() {
publication = new Publication();
}
/**
* Checks if the parser is entering a title or booktitle tag. If so
* is the case, then a boolean flag is set.
*
* @param tagName the name of the current tag
*/
private void testIfLeavingTitle(String tagName) {
if ("title".equals(tagName) || "booktitle".equals(tagName))
insideTitle = false;
}
/**
* Checks if the parser is entering a title or booktitle tag. If so
* is the case, then a boolean flag is set.
*
* @param tagName the name of the current tag
*/
private void testIfEnteringTitle(String tagName) {
if ("title".equals(tagName) || "booktitle".equals(tagName))
insideTitle = true;
}
/**
* Checks if the current tag is one of:
* - article, inproceedings, proceedings, book, incollection, phdthesis, mastersthesis, www
* If the current tag is one of these, then the type of the current publication is set
* to the corresponding value.
*
* @param tagName the name of the current tag.
*/
private void testIfPublicationTag(String tagName) {
if ("article".equals(tagName)) {
publication.setType("article");
} else if ("inproceedings".equals(tagName)) {
publication.setType("inproceedings");
} else if ("proceedings".equals(tagName)) {
publication.setType("proceedings");
} else if ("book".equals(tagName)) {
publication.setType("book");
} else if ("incollection".equals(tagName)) {
publication.setType("incollection");
} else if ("phdthesis".equals(tagName)) {
publication.setType("phdthesis");
} else if ("mastersthesis".equals(tagName)) {
publication.setType("mastersthesis");
} else if ("www".equals(tagName)) {
publication.setType("www");
}
}
/**
* Checks if the tag has any attributes. If so, the existing attribute
* values are stored.
*
* A tag with attributes is one of:
* - article, inproceedings, proceedings, book, incollection, phdthesis, mastersthesis, www
*
* @param tagName the name of the current tag
* @param attributes the attributes of the current tag, if any
*/
private void testOnAttributes(String tagName, Attributes attributes) {
if (attributes.getLength() > 0) {
publicationType = tagName;
if (attributes.getValue("key") != null) {
publication.setKey(attributes.getValue("key"));
}
if (attributes.getValue("mdate") != null) {
publication.setMdate(attributes.getValue("mdate"));
}
if (attributes.getValue("publtype") != null) {
publication.setMdate(attributes.getValue("publtype"));
}
if (attributes.getValue("reviewid") != null) {
publication.setMdate(attributes.getValue("reviewid"));
}
if (attributes.getValue("rating") != null) {
publication.setMdate(attributes.getValue("rating"));
}
publicationStack.push(publication);
}
}
}
pom.xml:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.dblp</groupId>
<artifactId>graphdbcreator</artifactId>
<version>0.1.0</version>
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>1.2.2.RELEASE</version>
</parent>
<dependencies>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter</artifactId>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-context</artifactId>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-tx</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.data</groupId>
<artifactId>spring-data-neo4j</artifactId>
</dependency>
<dependency>
<groupId>org.hibernate</groupId>
<artifactId>hibernate-validator</artifactId>
</dependency>
<dependency>
<groupId>javax.el</groupId>
<artifactId>javax.el-api</artifactId>
<version>2.2.4</version>
</dependency>
<dependency>
<groupId>xerces</groupId>
<artifactId>xercesImpl</artifactId>
<version>2.8.0</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
</plugin>
</plugins>
</build>
<repositories>
<repository>
<id>spring-releases</id>
<name>Spring Releases</name>
<url>https://repo.spring.io/libs-release</url>
</repository>
<repository>
<id>neo4j</id>
<name>Neo4j</name>
<url>http://m2.neo4j.org/</url>
</repository>
</repositories>
</project>
It seems my SAX handler was flawed. For instance, given a tag <author>Daniël Paulusma</author>
, the parser would make one call to the characters() method for "Dani", another call to characters() for "ë", and a third call to characters() for "l Paulusma". I found a simple solution to this problem here: SAX parsing and special characters.