Skip to content

Instantly share code, notes, and snippets.

@voltek62
Created October 26, 2018 20:49
Show Gist options
  • Select an option

  • Save voltek62/04ec0836f1922a9d6802aabd068f0298 to your computer and use it in GitHub Desktop.

Select an option

Save voltek62/04ec0836f1922a9d6802aabd068f0298 to your computer and use it in GitHub Desktop.

Revisions

  1. voltek62 created this gist Oct 26, 2018.
    88 changes: 88 additions & 0 deletions display_internal_page_rank.R
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,88 @@
    # Auto-install: fetch every required package that is not installed yet.
    packages <- c("igraph", "dplyr", "ggplot2", "magrittr")
    missingPackages <- setdiff(packages, rownames(installed.packages()))
    if (length(missingPackages) > 0) {
      install.packages(missingPackages)
    }

    # Enjoy learning ? https://dataseolabs.com

    library(igraph)
    library(dplyr)
    library(ggplot2)
    library(magrittr)

    # Linearly rescale the values of `x` into a target interval.
    #
    # Args:
    #   x: numeric vector or matrix to rescale.
    #   range: length-2 numeric target interval. When range[1] > range[2] the
    #     mapping is reversed (the largest input maps to the smallest output).
    #   from.range: length-2 numeric source interval. When it contains NA
    #     (the default) the observed range of `x`, ignoring NAs, is used.
    #
    # Returns:
    #   An object shaped like `x` holding the rescaled values. Values that fall
    #   outside `range` (possible with an explicit `from.range`) become NA.
    #   When the source interval has zero width, every element is set to the
    #   midpoint of `range`.
    map <- function(x, range = c(0, 1), from.range = NA) {
      if (any(is.na(from.range))) {
        from.range <- range(x, na.rm = TRUE)
      }
      from.width <- diff(from.range)

      ## all values are the same: there is nothing to scale, use the midpoint
      if (from.width == 0) {
        if (is.null(dim(x))) {
          # Plain vector: nrow()/ncol() are NULL here, so matrix() cannot be
          # used; fill in place to keep the length and the names.
          x[] <- mean(range)
          return(x)
        }
        return(matrix(mean(range), ncol = ncol(x), nrow = nrow(x),
                      dimnames = dimnames(x)))
      }

      ## map to [0,1]
      x <- (x - from.range[1]) / from.width

      ## map from [0,1] to [range]
      if (range[1] > range[2]) {
        x <- 1 - x
      }
      x <- x * abs(diff(range)) + min(range)

      ## anything outside the target interval is treated as missing
      x[x < min(range) | x > max(range)] <- NA

      x
    }

    # Import the Screaming Frog export: "Bulk Export" then "All Outlinks".
    # The file is comma-separated, so read.csv() (sep = ",", dec = ".") is the
    # matching reader. skip = 1 drops the first line of the export so that the
    # following line is used as the header row.
    DF <- read.csv("all_outlinks.csv", header = TRUE, stringsAsFactors = FALSE,
                   skip = 1)

    # Fail early with a readable message when the expected columns are absent
    # (for example when the file has no extra first line and skip = 1 consumed
    # the header instead).
    requiredColumns <- c("Type", "Source", "Destination")
    missingColumns <- setdiff(requiredColumns, colnames(DF))
    if (length(missingColumns) > 0) {
      stop("all_outlinks.csv is missing column(s): ",
           paste(missingColumns, collapse = ", "), call. = FALSE)
    }

    ## we keep only links; which() ignores rows whose Type is NA, whereas a
    ## bare logical index would turn them into all-NA rows
    DF <- DF[which(DF$Type == "AHREF"), c("Source", "Destination")]

    ## adapt colnames and rownames
    colnames(DF) <- c("From", "To")
    rownames(DF) <- NULL

    # Build the link graph from the data.frame: one vertex per URL and one
    # directed edge per From -> To link.
    graphObject <- graph_from_data_frame(DF, directed = TRUE)

    # PageRank is defined on directed graphs: a link passes rank from its
    # source to its destination. Keep the direction and only collapse
    # duplicate links and self-links. Converting to an undirected graph would
    # make the `directed = TRUE` argument below a no-op.
    graphObject <- simplify(graphObject)

    # calculate pagerank
    pr <- page_rank(graphObject, directed = TRUE, damping = 0.85)

    # Plot the graph with node sizes driven by PageRank, rescaled to [1, 20]
    # so that the sizes are usable as vertex sizes. Labels are hidden.
    plot(graphObject,
         layout = layout_with_fr,
         vertex.size = map(pr$vector, c(1, 20)),
         vertex.label = NA,
         vertex.label.color = "black",
         edge.arrow.size = .2)

    # Store the PageRank scores in a data.frame, highest score first.
    urls_pagerank <- as.data.frame(sort(pr$vector, decreasing = TRUE))
    colnames(urls_pagerank) <- "raw.internal.pagerank"

    # The vertex names end up as row names; move them into a regular column.
    urls_pagerank$Address <- rownames(urls_pagerank)
    rownames(urls_pagerank) <- NULL

    # Add a score rescaled to the 1-10 interval next to the raw value.
    urls_pagerank <- urls_pagerank %>%
      mutate(internal.pagerank = map(raw.internal.pagerank, c(1, 10)))

    # Print only your top URLs: keep the nbUrl vertices with the highest
    # PageRank. head() copes with graphs that have fewer than nbUrl vertices.
    nbUrl <- 800
    topUrls <- names(head(sort(pr$vector, decreasing = TRUE), nbUrl))

    # Sub-graph made of those vertices and the links between them. Vertices
    # are selected by name (graphs built from a data.frame carry a `name`
    # vertex attribute).
    graphObjectTopUrl <- induced_subgraph(graphObject, topUrls)

    # Vertex sizes use the same [1, 20] scale as the full plot. They are looked
    # up by vertex name so that each size belongs to the vertex it is drawn on.
    prLimit <- as.numeric(
      map(pr$vector, c(1, 20))[V(graphObjectTopUrl)$name]
    )

    # use tkplot for interactive graph
    tkplot(graphObjectTopUrl,
           layout = layout_with_fr,
           vertex.size = prLimit,
           vertex.label = NA)