library(XML)
library(RCurl)
library(ggplot2)

## Scrape the NHL.com skater summary table (sorted by points), clean it up,
## and plot points against plus/minus for one team into a PDF.
##
## Objects left in the workspace:
##   results    - every scraped row, with R-friendly column names
##   s.results  - results reduced to one row per player, Team trimmed to the
##                first team listed
##   t.results  - the rows of s.results belonging to `team`

base.url <- "http://www.nhl.com/ice/playerstats.htm?fetchKey=20122ALLSASAll&viewName=summary&sort=points&pg="
n.pages <- 30L
team <- "BOS"

## One data frame per page; bound together once after the loop instead of
## growing `results` with rbind() on every iteration.
page.stats <- vector("list", n.pages)
column.names <- NULL

## Loop through the 30 pages of player data
for (i in seq_len(n.pages)) {
  webpage <- getURL(paste0(base.url, i))
  h <- htmlParse(webpage)

  ## Every body row of the page; the first column is dropped
  stats <- xmlToDataFrame(nodes = getNodeSet(h, "//tbody//tr"))[, -1]

  ## Grab the column names only on the first iteration
  if (i == 1) {
    header.xpath <- paste0(
      "//table [@summary='2011-2012 - Regular Season - Skater - Summary - Points'] ",
      "//thead//tr//th//a[@title]"
    )
    nodes <- getNodeSet(h, header.xpath)
    cols <- as.character(xmlToDataFrame(nodes)[, 1])
    column.names <- gsub("\\n", "", cols)

    ## Append columns since any sorted column and Team column do not appear
    ## in a structured format in the HTML
    column.names <- append(column.names, "Team", after = 1)
    column.names <- append(column.names, "P", after = 6)

    ## Clean up column names so they are R-friendly
    column.names[8] <- "Plus.Minus"
    column.names[15] <- "Shooting.Percentage"
    column.names[16] <- "Time.On.Ice.Per.Game"
    column.names[17] <- "Avg.Shifts.Per.Game"
    column.names[18] <- "Faceoff.Win.Percentage"

    ## Fail loudly if the header no longer lines up with the data columns,
    ## rather than silently mislabelling them.
    if (length(column.names) != ncol(stats)) {
      stop(
        "Expected ", ncol(stats), " column names but built ",
        length(column.names),
        call. = FALSE
      )
    }
  }

  colnames(stats) <- column.names
  ## Each page is stored exactly once (page 1 used to be appended twice).
  page.stats[[i]] <- stats
}

results <- do.call(rbind, page.stats)

## Remove plus signs from +/- so we can treat it as a number
results$Plus.Minus <- as.numeric(gsub("\\+", "", results$Plus.Minus))

## Format factors as numeric data types. Column 16 (Time.On.Ice.Per.Game) is
## skipped and left as scraped. lapply() converts column by column without
## first coercing the whole data frame to a character matrix.
numeric.cols <- c(4:15, 17:18)
results[, numeric.cols] <- lapply(
  results[, numeric.cols],
  function(x) as.numeric(as.character(x))
)

## Keep the first row seen for each player name
s.results <- results[!duplicated(results$Player), ]

## We only care about the first Team listed and not if that player was on
## multiple teams in 2011-12
s.results$Team <- gsub("\\,\\s+\\w+", "", as.character(s.results$Team))

## Pull out a team to visualize
t.results <- subset(s.results, Team == team)

## Plot the data and save in a PDF
team.plot <- ggplot(
  t.results,
  aes(
    x = Plus.Minus,
    y = P,
    size = Avg.Shifts.Per.Game,
    colour = Pos,
    label = Player
  )
) +
  geom_text() +
  labs(x = "+/-", y = "Points", title = team)

pdf(file = "Bruins.pdf", width = 11, height = 8)
## Explicit print(): ggplot objects are only drawn when printed, and
## auto-printing does not happen when the script is run through source().
print(team.plot)
dev.off()