== The World Cup Graph :neo4j-version: neo4j-2.1 :author: Mark Needham :twitter: @markhneedham === Initial Data Setup //setup //hide [source,cypher] ---- CREATE INDEX ON :Match(id); CREATE INDEX ON :WorldCup(name); CREATE INDEX ON :Stadium(name); CREATE INDEX ON :Phase(phase); CREATE INDEX ON :Country(name); CREATE INDEX ON :Time(time); CREATE INDEX ON :MatchNumber(value); USING PERIODIC COMMIT 1000 LOAD CSV WITH HEADERS FROM "https://dl.dropboxusercontent.com/u/7619809/matches.csv" AS csvLine WITH csvLine, toInt(csvLine.match_number) AS matchNumber WITH csvLine, CASE WHEN csvLine.phase = "" THEN CASE WHEN matchNumber <= 48 THEN "Group matches" WHEN matchNumber > 48 AND matchNumber <= 56 THEN "Round of 16" WHEN matchNumber > 56 AND matchNumber <= 60 THEN "Quarter-finals" WHEN matchNumber > 60 AND matchNumber <= 62 THEN "Semi-finals" WHEN matchNumber = 63 THEN "Third place" ELSE "Final" END ELSE csvLine.phase END AS phase, matchNumber MERGE (match:Match {id: csvLine.id}) SET match.h_score = csvLine.h_score, match.a_score = csvLine.a_score, match.attendance = csvLine.attendance, match.date = csvLine.date, match.description = csvLine.home + " vs. " + csvLine.away MERGE (host:Country {name: csvLine.host}) MERGE (home:Country {name: csvLine.home}) MERGE (match)-[:HOME_TEAM]->(home) MERGE (away:Country {name: csvLine.away}) MERGE (match)-[:AWAY_TEAM]->(away) MERGE (year:Year {year: toInt(csvLine.year)}) MERGE (worldCup:WorldCup {name: csvLine.world_cup}) MERGE (match)<-[:CONTAINS_MATCH]-(worldCup) MERGE (host)<-[:HOSTED_BY]-(worldCup) MERGE (year)<-[:IN_YEAR]-(worldCup) MERGE (stadium:Stadium {name: csvLine.stadium}) MERGE (match)-[:PLAYED_IN_STADIUM]->(stadium) MERGE (p:Phase {name: phase}) MERGE (match)-[:IN_PHASE]->(p) MERGE (mn:MatchNumber {value: matchNumber}) MERGE (match)-[:HAS_MATCH_NUMBER]->(mn) MERGE (time:Time {time: csvLine.time}) MERGE (match)-[:PLAYED_AT_TIME]->(time) RETURN count(*) as matches; ---- The graph is too large to visualize nicely, that's why here only a few matches from 2010 as example. [source, cypher] ---- MATCH (match)<-[:CONTAINS_MATCH]-(worldCup)-[:HOSTED_BY]-(host), (worldCup)-[:IN_YEAR]-(year:Year {year:2010}) RETURN * LIMIT 10 ---- //table === Which Worldcups are in this dataset? [source, cypher] ---- MATCH (stadium:Stadium)<-[:PLAYED_IN_STADIUM]-()<-[:CONTAINS_MATCH]-(worldCup:WorldCup)-[:HOSTED_BY]-(host:Country), (worldCup)-[:IN_YEAR]-(year:Year) RETURN worldCup.name,year.year,host.name,collect(distinct stadium.name) ORDER BY year.year ASC ---- //table === Find the stadiums that hosted the most World Cup matches and which World Cups those were in. [source, cypher] ---- MATCH (stadium:Stadium)<-[:PLAYED_IN_STADIUM]-()<-[:CONTAINS_MATCH]-(wc)-[:HOSTED_BY]-(host), (wc)-[:IN_YEAR]-(year) WITH stadium, host, COUNT(*) as count, COLLECT(DISTINCT year.year) AS years UNWIND years as year WITH stadium, host, count, year ORDER BY stadium.name, host.name, year RETURN stadium.name, host.name, COLLECT(year) AS years, count ORDER BY count DESC LIMIT 5 ---- //table === Which countries hosted the most World Cups and when? [source, cypher] ---- MATCH (host:Country)<-[:HOSTED_BY]-()-[:IN_YEAR]->(year) WITH host, COUNT(*) AS times, COLLECT(year.year) AS years UNWIND years AS year WITH host, times, year ORDER BY times DESC, year RETURN host.name, times, COLLECT(year) AS years ORDER BY times DESC ---- //table