Skip to content

Instantly share code, notes, and snippets.

@inkrement
Last active March 12, 2024 11:44
Show Gist options
  • Select an option

  • Save inkrement/497c3be06f002be6dd1a842a4475f2dc to your computer and use it in GitHub Desktop.

Select an option

Save inkrement/497c3be06f002be6dd1a842a4475f2dc to your computer and use it in GitHub Desktop.

Revisions

  1. inkrement revised this gist Aug 8, 2019. 1 changed file with 10 additions and 3 deletions.
    13 changes: 10 additions & 3 deletions baseR_embeddings.R
    Original file line number Diff line number Diff line change
    @@ -23,14 +23,21 @@ load_embedding <- function(file_path){
    embeddings_env
    }

    # Cosine similarity between two vectors that are already unit length.
    #
    # a, b: numeric vectors of equal length; both are assumed to have been
    #       normalized to length 1 beforehand (this function does not rescale).
    #
    # Returns the result of the matrix product `a %*% b`; for two plain numeric
    # vectors that is a 1x1 matrix holding their inner product.
    cosine_similarity <- function(a,b){
      # for unit vectors the denominator |a| * |b| equals 1, so the cosine
      # collapses to the inner product
      inner_product <- a %*% b
      inner_product
    }


    # Rank the stored embeddings by cosine similarity to a reference vector.
    #
    # embeddings: environment mapping labels to vectors (walked with eapply)
    # ref_item:   the reference vector itself (not a label); it is passed as
    #             argument `b` to cosine_similarity()
    # n_top:      maximum number of entries to return
    #
    # Returns a named list of similarity values sorted from most to least
    # similar. At most n_top entries are returned; fewer when fewer candidates
    # remain after filtering.
    most_similar <- function(embeddings, ref_item, n_top = 10){
      # calculate cos similarity to ref_item for all elements
      cos_sims <- eapply(embeddings, cosine_similarity, b = ref_item)

      # Drop entries whose similarity is (numerically) 1, i.e. the reference
      # element itself. A strict `< 1` is unreliable because the dot product of
      # a normalized vector with itself can round to just below or above 1.
      # which() also discards NA/NaN similarities instead of indexing with NA.
      tolerance <- sqrt(.Machine$double.eps)
      keep <- which(unlist(cos_sims) < 1 - tolerance)
      cos_sims <- cos_sims[keep]

      # sort descending and return the top elements; head() does not pad the
      # result with NULL entries when fewer than n_top candidates exist
      ranked <- cos_sims[order(unlist(cos_sims), decreasing = TRUE)]
      head(ranked, n_top)
    }
  2. inkrement revised this gist Aug 7, 2019. No changes.
  3. inkrement revised this gist Aug 7, 2019. 1 changed file with 7 additions and 7 deletions.
    14 changes: 7 additions & 7 deletions baseR_embeddings.R
    Original file line number Diff line number Diff line change
    @@ -5,7 +5,7 @@ load_embedding <- function(file_path){

    # create new environment
    embeddings_env <- new.env(hash = TRUE, parent = emptyenv())

    # this function is used to convert vectors to unit vectors
    # by dividing their components by vector length
    normalize_vector <- function(a){
    @@ -14,10 +14,10 @@ load_embedding <- function(file_path){

    # iterate through the whole file line by line
    for (i in 1:length(lines)) {
    line <- lines[[i]]
    values <- strsplit(line, " ")[[1]]
    label <- values[[1]]
    embeddings_env[[label]] <- normalize_vector(as.double(values[-1]))
    line <- lines[[i]]
    values <- strsplit(line, " ")[[1]]
    label <- values[[1]]
    embeddings_env[[label]] <- normalize_vector(as.double(values[-1]))
    }

    embeddings_env
    @@ -26,11 +26,11 @@ load_embedding <- function(file_path){
    # Rank the stored embeddings by cosine similarity to a reference vector.
    #
    # embeddings: environment mapping labels to vectors (walked with eapply);
    #             the vectors are assumed to be unit length, so the cosine is
    #             just the dot product
    # ref_item:   the reference vector itself (not a label)
    # n_top:      maximum number of entries to return
    #
    # Returns a named list of similarity values sorted from most to least
    # similar. At most n_top entries are returned; fewer when fewer candidates
    # remain after filtering.
    most_similar <- function(embeddings, ref_item, n_top = 10){
      # calculate cos similarity to ref_item for all elements; the dot product
      # is computed inline because no cos_sim helper is defined alongside this
      # function. `%*%` raises an error when the vector lengths differ.
      cos_sims <- eapply(embeddings, function(vec) drop(vec %*% ref_item))

      # Drop entries whose similarity is (numerically) 1, i.e. the reference
      # element itself. A strict `< 1` is unreliable because the dot product of
      # a normalized vector with itself can round to just below or above 1.
      # which() also discards NA/NaN similarities instead of indexing with NA.
      tolerance <- sqrt(.Machine$double.eps)
      keep <- which(unlist(cos_sims) < 1 - tolerance)
      cos_sims <- cos_sims[keep]

      # sort descending and return the top elements; head() does not pad the
      # result with NULL entries when fewer than n_top candidates exist
      ranked <- cos_sims[order(unlist(cos_sims), decreasing = TRUE)]
      head(ranked, n_top)
    }
  4. inkrement revised this gist Aug 7, 2019. No changes.
  5. inkrement created this gist Aug 7, 2019.
    36 changes: 36 additions & 0 deletions baseR_embeddings.R
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,36 @@
    # Load word embeddings from a space-delimited text file into an environment.
    #
    # file_path: path to a text file with one entry per line, in the form
    #            "<label> <v1> <v2> ..." separated by single spaces
    #
    # Returns an environment (hashed, with the empty environment as parent)
    # that maps each label to its vector, rescaled to unit length.
    load_embedding <- function(file_path){

      # load full file
      lines <- readLines(file_path)

      # create new environment
      embeddings_env <- new.env(hash = TRUE, parent = emptyenv())

      # this function is used to convert vectors to unit vectors
      # by dividing their components by vector length
      # NOTE: an all-zero vector has length 0 and therefore yields NaN components
      normalize_vector <- function(a){
        a / sqrt(sum(a^2))
      }

      # iterate through the whole file line by line; seq_along() makes the loop
      # a no-op for an empty file, whereas 1:length(lines) would run over c(1, 0)
      for (i in seq_along(lines)) {
        values <- strsplit(lines[[i]], " ", fixed = TRUE)[[1]]

        # skip blank lines and lines without a usable label instead of failing
        if (length(values) == 0 || !nzchar(values[[1]])) {
          next
        }

        label <- values[[1]]
        embeddings_env[[label]] <- normalize_vector(as.double(values[-1]))
      }

      embeddings_env
    }

    # Rank the stored embeddings by cosine similarity to a reference vector.
    #
    # embeddings: environment mapping labels to vectors (walked with eapply);
    #             the vectors are assumed to be unit length, so the cosine is
    #             just the dot product
    # ref_item:   the reference vector itself (not a label)
    # n_top:      maximum number of entries to return
    #
    # Returns a named list of similarity values sorted from most to least
    # similar. At most n_top entries are returned; fewer when fewer candidates
    # remain after filtering.
    most_similar <- function(embeddings, ref_item, n_top = 10){
      # calculate cos similarity to ref_item for all elements; the dot product
      # is computed inline because no cos_sim helper is defined alongside this
      # function. `%*%` raises an error when the vector lengths differ.
      cos_sims <- eapply(embeddings, function(vec) drop(vec %*% ref_item))

      # Drop entries whose similarity is (numerically) 1, i.e. the reference
      # element itself. A strict `< 1` is unreliable because the dot product of
      # a normalized vector with itself can round to just below or above 1.
      # which() also discards NA/NaN similarities instead of indexing with NA.
      tolerance <- sqrt(.Machine$double.eps)
      keep <- which(unlist(cos_sims) < 1 - tolerance)
      cos_sims <- cos_sims[keep]

      # sort descending and return the top elements; head() does not pad the
      # result with NULL entries when fewer than n_top candidates exist
      ranked <- cos_sims[order(unlist(cos_sims), decreasing = TRUE)]
      head(ranked, n_top)
    }