Created
October 26, 2018 20:49
-
-
Save voltek62/04ec0836f1922a9d6802aabd068f0298 to your computer and use it in GitHub Desktop.
Revisions
-
voltek62 created this gist
Oct 26, 2018. There are no files selected for viewing.
# autoinstall packages
packages <- c("igraph", "dplyr", "ggplot2", "magrittr")
missing_packages <- setdiff(packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

# Enjoy learning ? https://dataseolabs.com
library(igraph)
library(dplyr)
library(ggplot2)
library(magrittr)

# Linearly rescale the values of `x` into the interval given by `range`.
#
# x          numeric vector or matrix to rescale
# range      length-2 target interval; when range[1] > range[2] the mapping
#            is reversed (the largest input gets the smallest output)
# from.range length-2 source interval; when it contains NA it is taken from
#            the data with range(x, na.rm = TRUE)
#
# Returns an object shaped like `x`. Values that land outside `range` (which
# can happen when `from.range` is narrower than the data) are set to NA.
map <- function(x, range = c(0, 1), from.range = NA) {
  if (any(is.na(from.range))) {
    from.range <- range(x, na.rm = TRUE)
  }
  span <- diff(from.range)

  # All values identical: there is nothing to scale, so every element gets
  # the midpoint of the target interval. Assigning through `x[]` keeps the
  # shape and names of `x`, so this works for plain vectors as well as
  # matrices (building a matrix from ncol(x)/nrow(x) fails on a vector).
  if (span == 0) {
    x[] <- mean(range)
    return(x)
  }

  # map to [0, 1]
  x <- (x - from.range[1]) / span

  # map from [0, 1] to [range], reversing first for a descending range
  if (range[1] > range[2]) {
    x <- 1 - x
  }
  x <- x * abs(diff(range)) + min(range)

  x[x < min(range) | x > max(range)] <- NA
  x
}

# import ScreamingFrog: "Bulk Export" then "All Outlinks"
# skip = 1 drops the first line of the export, so the header is read from
# the second line of the file
DF <- read.csv(
  "all_outlinks.csv",
  header = TRUE,
  stringsAsFactors = FALSE,
  skip = 1
)

# we keep only links; dplyr::filter() also drops rows whose Type is NA,
# which logical subsetting with `[` would turn into all-NA rows
# NOTE(review): the link type label may depend on the Screaming Frog
# version - confirm that your export uses "AHREF"
DF <- dplyr::filter(DF, Type == "AHREF")
DF <- select(DF, Source, Destination)

# adapt colnames and rownames
colnames(DF) <- c("From", "To")
rownames(DF) <- NULL

# generate graph with data.frame
graphObject <- graph_from_data_frame(DF, directed = TRUE)

# remove duplicate links and self-links, but keep the graph directed:
# PageRank follows link direction, and `directed = TRUE` is ignored once
# the graph has been converted to an undirected one
graphObject <- simplify(graphObject)

# calculate pagerank
pr <- page_rank(graphObject, directed = TRUE, damping = 0.85)

# print graph with size node linked with pagerank
plot(
  graphObject,
  layout = layout_with_fr,
  vertex.size = map(pr$vector, c(1, 20)),
  vertex.label = NA,
  vertex.label.color = "black",
  edge.arrow.size = 0.2
)
# calculate pagerank and store into your data.frame
# one row per URL, highest PageRank first; `internal.pagerank` is the raw
# score rescaled to the 1-10 range
pr_sorted <- sort(pr$vector, decreasing = TRUE)
urls_pagerank <- data.frame(
  raw.internal.pagerank = unname(pr_sorted),
  Address = names(pr_sorted),
  stringsAsFactors = FALSE
)
urls_pagerank$internal.pagerank <- map(
  urls_pagerank$raw.internal.pagerank,
  c(1, 10)
)

# print only your top URLs
# keep the nbUrl vertices with the highest PageRank (head() copes with
# graphs that have fewer than nbUrl vertices) and the links between them
nbUrl <- 800
top_urls <- head(names(pr_sorted), nbUrl)
graphObjectTopUrl <- induced_subgraph(
  graphObject,
  vids = which(V(graphObject)$name %in% top_urls)
)

# use tkplot for interactive graph
# sizes are scaled over the whole graph, then looked up by vertex name so
# that each size belongs to the vertex it is drawn on
vertex_sizes <- map(pr$vector, c(1, 20))
prLimit <- as.numeric(vertex_sizes[V(graphObjectTopUrl)$name])
tkplot(
  graphObjectTopUrl,
  layout = layout_with_fr,
  vertex.size = prLimit,
  vertex.label = NA
)