Skip to content

Instantly share code, notes, and snippets.

@schaunwheeler
Last active December 11, 2020 16:41
Show Gist options
  • Select an option

  • Save schaunwheeler/5825002 to your computer and use it in GitHub Desktop.

Select an option

Save schaunwheeler/5825002 to your computer and use it in GitHub Desktop.

Revisions

  1. schaunwheeler revised this gist Jun 12, 2014. 1 changed file with 23 additions and 0 deletions.
    23 changes: 23 additions & 0 deletions xlsxToR.r
    Original file line number Diff line number Diff line change
    @@ -1,3 +1,26 @@
    # The MIT License (MIT)
    #
    # Copyright (c) 2012 Schaun Jacob Wheeler
    #
    # Permission is hereby granted, free of charge, to any person obtaining a copy
    # of this software and associated documentation files (the "Software"), to deal
    # in the Software without restriction, including without limitation the rights
    # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    # copies of the Software, and to permit persons to whom the Software is
    # furnished to do so, subject to the following conditions:
    #
    # The above copyright notice and this permission notice shall be included in all
    # copies or substantial portions of the Software.
    #
    # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
    # SOFTWARE.


    library(XML)
    library(plyr)
    library(pbapply)
  2. schaunwheeler revised this gist Oct 24, 2013. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion xlsxToR.r
    Original file line number Diff line number Diff line change
    @@ -5,7 +5,7 @@ library(pbapply)
    xlsxToR <- function(file, keep_sheets = NULL, header = FALSE) {

    temp_dir <- file.path(tempdir(), "xlsxToRtemp")
    dir.create(temp_dir)
    suppressWarnings(dir.create(temp_dir))

    file.copy(file, temp_dir)
    new_file <- list.files(temp_dir, full.name = TRUE, pattern = basename(file))
  3. schaunwheeler revised this gist Oct 24, 2013. 1 changed file with 6 additions and 5 deletions.
    11 changes: 6 additions & 5 deletions xlsxToR.r
    Original file line number Diff line number Diff line change
    @@ -91,15 +91,16 @@ xlsxToR <- function(file, keep_sheets = NULL, header = FALSE) {

    entries <- xmlParse(list.files(paste0(temp_dir, "/xl"), full.name = TRUE,
    pattern = "sharedStrings.xml$"))
    entries <- xpathSApply(entries, "//x:t", namespaces = "x", xmlValue)
    entries <- xpathSApply(entries, "//x:si", namespaces = "x", xmlValue)
    names(entries) <- seq_along(entries) - 1

    entries_match <- entries[match(worksheets$v, names(entries))]
    worksheets$v[worksheets$t == "s" & !is.na(worksheets$t)] <-
    entries_match[worksheets$t == "s"& !is.na(worksheets$t)]
    entries_match <- entries[
    match(worksheets$v[worksheets$t == "s" & !is.na(worksheets$t)],
    names(entries))]
    worksheets$v[worksheets$t == "s" & !is.na(worksheets$t)] <- entries_match
    worksheets$cols <- match(gsub("\\d", "", worksheets$r), LETTERS)
    worksheets$rows <- as.numeric(gsub("\\D", "", worksheets$r))

    if(!any(grepl("^s$", colnames(worksheets)))) {
    worksheets$s <- NA
    }
  4. schaunwheeler revised this gist Oct 22, 2013. 1 changed file with 3 additions and 1 deletion.
    4 changes: 3 additions & 1 deletion xlsxToR.r
    Original file line number Diff line number Diff line change
    @@ -29,7 +29,9 @@ xlsxToR <- function(file, keep_sheets = NULL, header = FALSE) {
    # Get names of sheets
    sheet_names <- xmlToList(xmlParse(list.files(
    paste0(temp_dir, "/xl"), full.name = TRUE, pattern = "workbook.xml")))
    sheet_names <- do.call("rbind", sheet_names$sheets)
    sheet_names <- rbind.fill(lapply(sheet_names$sheets, function(x) {
    as.data.frame(as.list(x), stringsAsFactors = FALSE)
    }))
    rownames(sheet_names) <- NULL
    sheet_names <- as.data.frame(sheet_names,stringsAsFactors = FALSE)
    sheet_names$id <- gsub("\\D", "", sheet_names$id)
  5. schaunwheeler revised this gist Oct 22, 2013. 1 changed file with 4 additions and 11 deletions.
    15 changes: 4 additions & 11 deletions xlsxToR.r
    Original file line number Diff line number Diff line change
    @@ -35,23 +35,16 @@ xlsxToR <- function(file, keep_sheets = NULL, header = FALSE) {
    sheet_names$id <- gsub("\\D", "", sheet_names$id)

    # Get column classes
    styles <- xmlToList(xmlParse(list.files(
    paste0(temp_dir, "/xl"), full.name = TRUE, pattern = "styles.xml")))
    styles <- styles$cellXfs[names(styles$cellXfs) == "xf"]
    styles <- lapply(styles, function(x) x$.attrs)
    styles <- styles[sapply(styles, function(x) {
    any(grepl("applyNumberFormat", names(x)))})]
    styles <- xmlParse(list.files(
    paste0(temp_dir, "/xl"), full.name = TRUE, pattern = "styles.xml"))
    styles <- xpathApply(styles, "//x:xf[@applyNumberFormat and @numFmtId]",
    namespaces = "x", xmlAttrs)
    styles <- lapply(styles, function(x) {
    x[grepl("applyNumberFormat|numFmtId", names(x))]})
    styles <- do.call("rbind", (lapply(styles,
    function(x) as.data.frame(as.list(x[c("applyNumberFormat", "numFmtId")]),
    stringsAsFactors = FALSE))))


    styles <- styles$cellXfs[
    sapply(styles$cellXfs, function(x) any(names(x) == "applyNumberFormat"))]


    if(!is.null(keep_sheets)) {
    sheet_names <- sheet_names[sheet_names$name %in% keep_sheets,]

  6. schaunwheeler revised this gist Oct 15, 2013. 1 changed file with 20 additions and 5 deletions.
    25 changes: 20 additions & 5 deletions xlsxToR.r
    Original file line number Diff line number Diff line change
    @@ -37,11 +37,20 @@ xlsxToR <- function(file, keep_sheets = NULL, header = FALSE) {
    # Get column classes
    styles <- xmlToList(xmlParse(list.files(
    paste0(temp_dir, "/xl"), full.name = TRUE, pattern = "styles.xml")))
    styles <- styles$cellXfs[names(styles$cellXfs) == "xf"]
    styles <- lapply(styles, function(x) x$.attrs)
    styles <- styles[sapply(styles, function(x) {
    any(grepl("applyNumberFormat", names(x)))})]
    styles <- lapply(styles, function(x) {
    x[grepl("applyNumberFormat|numFmtId", names(x))]})
    styles <- do.call("rbind", (lapply(styles,
    function(x) as.data.frame(as.list(x[c("applyNumberFormat", "numFmtId")]),
    stringsAsFactors = FALSE))))


    styles <- styles$cellXfs[
    sapply(styles$cellXfs, function(x) any(names(x) == "applyNumberFormat"))]
    styles <- do.call("rbind", lapply(styles,
    function(x) as.data.frame(as.list(x[c("applyNumberFormat", "numFmtId")]),
    stringsAsFactors = FALSE)))


    if(!is.null(keep_sheets)) {
    sheet_names <- sheet_names[sheet_names$name %in% keep_sheets,]
    @@ -113,16 +122,22 @@ xlsxToR <- function(file, keep_sheets = NULL, header = FALSE) {
    y_style <- y_style[-1,]
    }

    y_style <- sapply(y_style,
    function(x) ifelse(length(unique(x)) == 1, unique(x), NA))
    y_style <- sapply(y_style, function(x) {
    out <- names(which.max(table(x)))
    out[is.null(out)] <- NA
    out
    })

    if(length(styles) > 0) {
    y_style <- styles$numFmtId[match(y_style, styles$applyNumberFormat)]
    }

    y_style[y_style %in% 14:17] <- "date"
    y_style[y_style %in% c(18:21, 45:47)] <- "time"
    y_style[y_style %in% 22] <- "datetime"
    y_style[is.na(y_style) & !sapply(y, function(x)any(grepl("\\D", x)))] <- "numeric"
    y_style[is.na(y_style)] <- "character"
    y_style[!(y_style %in% c("date", "time", "datetime", "numeric"))] <- "character"

    y[] <- lapply(seq_along(y), function(i) {
    switch(y_style[i],
  7. schaunwheeler revised this gist Oct 7, 2013. 1 changed file with 8 additions and 5 deletions.
    13 changes: 8 additions & 5 deletions xlsxToR.r
    Original file line number Diff line number Diff line change
    @@ -4,8 +4,9 @@ library(pbapply)

    xlsxToR <- function(file, keep_sheets = NULL, header = FALSE) {

    temp_dir <- paste0(tempdir(), '/xlsxToRtemp')

    temp_dir <- file.path(tempdir(), "xlsxToRtemp")
    dir.create(temp_dir)

    file.copy(file, temp_dir)
    new_file <- list.files(temp_dir, full.name = TRUE, pattern = basename(file))
    unzip(new_file, exdir = temp_dir)
    @@ -46,7 +47,7 @@ xlsxToR <- function(file, keep_sheets = NULL, header = FALSE) {
    sheet_names <- sheet_names[sheet_names$name %in% keep_sheets,]

    }

    worksheet_paths <- list.files(
    paste0(temp_dir, "/xl/worksheets"),
    full.name = TRUE,
    @@ -58,7 +59,7 @@ xlsxToR <- function(file, keep_sheets = NULL, header = FALSE) {
    worksheets <- lapply(worksheet_paths, function(x) xmlRoot(xmlParse(x))[["sheetData"]])

    worksheets <- pblapply(seq_along(worksheets), function(i) {

    x <- xpathApply(worksheets[[i]], "//x:c", namespaces = "x", function(node) {
    c("v" = xmlValue(node[["v"]]), xmlAttrs(node))
    })
    @@ -88,7 +89,7 @@ xlsxToR <- function(file, keep_sheets = NULL, header = FALSE) {
    pattern = "sharedStrings.xml$"))
    entries <- xpathSApply(entries, "//x:t", namespaces = "x", xmlValue)
    names(entries) <- seq_along(entries) - 1

    entries_match <- entries[match(worksheets$v, names(entries))]
    worksheets$v[worksheets$t == "s" & !is.na(worksheets$t)] <-
    entries_match[worksheets$t == "s"& !is.na(worksheets$t)]
    @@ -136,6 +137,8 @@ xlsxToR <- function(file, keep_sheets = NULL, header = FALSE) {

    if(length(workbook) == 1) {
    workbook <- workbook[[1]]
    } else {
    names(workbook) <- sheet_names$name
    }

    workbook
  8. schaunwheeler revised this gist Sep 18, 2013. 1 changed file with 3 additions and 3 deletions.
    6 changes: 3 additions & 3 deletions xlsxToR.r
    Original file line number Diff line number Diff line change
    @@ -1,6 +1,6 @@
    require(XML)
    require(plyr)
    require(pbapply)
    library(XML)
    library(plyr)
    library(pbapply)

    xlsxToR <- function(file, keep_sheets = NULL, header = FALSE) {

  9. schaunwheeler revised this gist Sep 18, 2013. 1 changed file with 5 additions and 5 deletions.
    10 changes: 5 additions & 5 deletions xlsxToR.r
    Original file line number Diff line number Diff line change
    @@ -17,7 +17,7 @@ xlsxToR <- function(file, keep_sheets = NULL, header = FALSE) {
    # is handled the same across both platforms. I've kept the original code here
    # commented out in case it can be of use in the future.
    # mac <- xmlToList(xmlParse(list.files(
    # paste0(tempdir(), "/docProps"), full.name = TRUE, pattern = "app.xml")))
    # paste0(temp_dir, "/docProps"), full.name = TRUE, pattern = "app.xml")))
    # mac <- grepl("Macintosh", mac$Application)
    # if(mac) {
    # os_origin <- "1899-12-30" # documentation says should be "1904-01-01"
    @@ -27,15 +27,15 @@ xlsxToR <- function(file, keep_sheets = NULL, header = FALSE) {

    # Get names of sheets
    sheet_names <- xmlToList(xmlParse(list.files(
    paste0(tempdir(), "/xl"), full.name = TRUE, pattern = "workbook.xml")))
    paste0(temp_dir, "/xl"), full.name = TRUE, pattern = "workbook.xml")))
    sheet_names <- do.call("rbind", sheet_names$sheets)
    rownames(sheet_names) <- NULL
    sheet_names <- as.data.frame(sheet_names,stringsAsFactors = FALSE)
    sheet_names$id <- gsub("\\D", "", sheet_names$id)

    # Get column classes
    styles <- xmlToList(xmlParse(list.files(
    paste0(tempdir(), "/xl"), full.name = TRUE, pattern = "styles.xml")))
    paste0(temp_dir, "/xl"), full.name = TRUE, pattern = "styles.xml")))
    styles <- styles$cellXfs[
    sapply(styles$cellXfs, function(x) any(names(x) == "applyNumberFormat"))]
    styles <- do.call("rbind", lapply(styles,
    @@ -48,7 +48,7 @@ xlsxToR <- function(file, keep_sheets = NULL, header = FALSE) {
    }

    worksheet_paths <- list.files(
    paste0(tempdir(), "/xl/worksheets"),
    paste0(temp_dir, "/xl/worksheets"),
    full.name = TRUE,
    pattern = paste0(
    "sheet(",
    @@ -84,7 +84,7 @@ xlsxToR <- function(file, keep_sheets = NULL, header = FALSE) {
    worksheets <- do.call("rbind.fill",
    worksheets[sapply(worksheets, class) == "data.frame"])

    entries <- xmlParse(list.files(paste0(tempdir(), "/xl"), full.name = TRUE,
    entries <- xmlParse(list.files(paste0(temp_dir, "/xl"), full.name = TRUE,
    pattern = "sharedStrings.xml$"))
    entries <- xpathSApply(entries, "//x:t", namespaces = "x", xmlValue)
    names(entries) <- seq_along(entries) - 1
  10. schaunwheeler revised this gist Sep 18, 2013. 1 changed file with 5 additions and 3 deletions.
    8 changes: 5 additions & 3 deletions xlsxToR.r
    Original file line number Diff line number Diff line change
    @@ -4,9 +4,11 @@ require(pbapply)

    xlsxToR <- function(file, keep_sheets = NULL, header = FALSE) {

    file.copy(file, tempdir())
    new_file <- list.files(tempdir(), full.name = TRUE, pattern = basename(file))
    unzip(new_file, exdir = tempdir())
    temp_dir <- paste0(tempdir(), '/xlsxToRtemp')

    file.copy(file, temp_dir)
    new_file <- list.files(temp_dir, full.name = TRUE, pattern = basename(file))
    unzip(new_file, exdir = temp_dir)

    # Get OS
    # These lines are included because R documentation states that Excel handles
  11. schaunwheeler revised this gist Sep 18, 2013. 1 changed file with 17 additions and 12 deletions.
    29 changes: 17 additions & 12 deletions xlsxToR.r
    Original file line number Diff line number Diff line change
    @@ -1,22 +1,27 @@
    require(XML)
    require(plyr)
    require(pbapply)

    xlsxToR <- function(file, keep_sheets = NULL, header = FALSE) {

    require(XML)
    require(plyr)
    require(pbapply)

    file.copy(file, tempdir())
    new_file <- list.files(tempdir(), full.name = TRUE, pattern = basename(file))
    unzip(new_file, exdir = tempdir())

    # Get OS
    mac <- xmlToList(xmlParse(list.files(
    paste0(tempdir(), "/docProps"), full.name = TRUE, pattern = "app.xml")))
    mac <- grepl("Macintosh", mac$Application)
    if(mac) {
    os_origin <- "1899-12-30" # documentation says should be "1904-01-01"
    } else {
    os_origin <- "1899-12-30"
    }
    # These lines are included because R documentation states that Excel handles
    # date origins differently on Mac than on Windows. However, manual inspection
    # of Excel files created on Windows and Mac indicated that in fact the origin
    # is handled the same across both platforms. I've kept the original code here
    # commented out in case it can be of use in the future.
    # mac <- xmlToList(xmlParse(list.files(
    # paste0(tempdir(), "/docProps"), full.name = TRUE, pattern = "app.xml")))
    # mac <- grepl("Macintosh", mac$Application)
    # if(mac) {
    # os_origin <- "1899-12-30" # documentation says should be "1904-01-01"
    # } else {
    # os_origin <- "1899-12-30"
    # }

    # Get names of sheets
    sheet_names <- xmlToList(xmlParse(list.files(
  12. schaunwheeler revised this gist Aug 9, 2013. 1 changed file with 1 addition and 8 deletions.
    9 changes: 1 addition & 8 deletions xlsxToR.r
    Original file line number Diff line number Diff line change
    @@ -6,14 +6,7 @@ xlsxToR <- function(file, keep_sheets = NULL, header = FALSE) {

    file.copy(file, tempdir())
    new_file <- list.files(tempdir(), full.name = TRUE, pattern = basename(file))
    file_pattern <- gsub(".*?/(.*?)\\.xlsx", "\\1", file)
    new_file_rename <- gsub(
    paste0(file_pattern, "(\\.xlsx)$"),
    paste0(file_pattern, ".zip"),
    new_file)
    file.rename(new_file, new_file_rename)

    unzip(new_file_rename, exdir = tempdir())
    unzip(new_file, exdir = tempdir())

    # Get OS
    mac <- xmlToList(xmlParse(list.files(
  13. schaunwheeler revised this gist Aug 9, 2013. 1 changed file with 6 additions and 3 deletions.
    9 changes: 6 additions & 3 deletions xlsxToR.r
    Original file line number Diff line number Diff line change
    @@ -4,11 +4,14 @@ xlsxToR <- function(file, keep_sheets = NULL, header = FALSE) {
    require(plyr)
    require(pbapply)

    suppressWarnings(file.remove(tempdir()))
    file.copy(file, tempdir())
    new_file <- list.files(tempdir(), full.name = TRUE, pattern = basename(file))
    new_file_rename <- gsub("xlsx$", "zip", new_file)
    file.rename(new_file, new_file_rename)
    file_pattern <- gsub(".*?/(.*?)\\.xlsx", "\\1", file)
    new_file_rename <- gsub(
    paste0(file_pattern, "(\\.xlsx)$"),
    paste0(file_pattern, ".zip"),
    new_file)
    file.rename(new_file, new_file_rename)

    unzip(new_file_rename, exdir = tempdir())

  14. schaunwheeler revised this gist Jul 3, 2013. 1 changed file with 42 additions and 24 deletions.
    66 changes: 42 additions & 24 deletions xlsxToR.r
    Original file line number Diff line number Diff line change
    @@ -2,6 +2,7 @@ xlsxToR <- function(file, keep_sheets = NULL, header = FALSE) {

    require(XML)
    require(plyr)
    require(pbapply)

    suppressWarnings(file.remove(tempdir()))
    file.copy(file, tempdir())
    @@ -10,7 +11,7 @@ xlsxToR <- function(file, keep_sheets = NULL, header = FALSE) {
    file.rename(new_file, new_file_rename)

    unzip(new_file_rename, exdir = tempdir())

    # Get OS
    mac <- xmlToList(xmlParse(list.files(
    paste0(tempdir(), "/docProps"), full.name = TRUE, pattern = "app.xml")))
    @@ -27,6 +28,7 @@ xlsxToR <- function(file, keep_sheets = NULL, header = FALSE) {
    sheet_names <- do.call("rbind", sheet_names$sheets)
    rownames(sheet_names) <- NULL
    sheet_names <- as.data.frame(sheet_names,stringsAsFactors = FALSE)
    sheet_names$id <- gsub("\\D", "", sheet_names$id)

    # Get column classes
    styles <- xmlToList(xmlParse(list.files(
    @@ -37,43 +39,59 @@ xlsxToR <- function(file, keep_sheets = NULL, header = FALSE) {
    function(x) as.data.frame(as.list(x[c("applyNumberFormat", "numFmtId")]),
    stringsAsFactors = FALSE)))

    worksheet_paths <- list.files(paste0(tempdir(), "/xl/worksheets"),
    full.name = TRUE, pattern = "xml$")
    if(!is.null(keep_sheets)) {
    sheet_names <- sheet_names[sheet_names$name %in% keep_sheets,]

    }

    worksheet_paths <- list.files(
    paste0(tempdir(), "/xl/worksheets"),
    full.name = TRUE,
    pattern = paste0(
    "sheet(",
    paste(sheet_names$id, collapse = "|"),
    ")\\.xml$"))

    worksheets <- lapply(worksheet_paths, function(x) xmlRoot(xmlParse(x))[["sheetData"]])

    worksheets <- lapply(worksheet_paths, function(x) xmlToList(xmlParse(x))$sheetData)
    worksheets <- lapply(seq_along(worksheets), function(i) {
    x <- lapply(worksheets[[i]], function(y) {
    y <- y[names(y) == "c"]
    y <- lapply(y, function(z) {
    z <- unlist(z)
    names(z) <- gsub("\\.?attrs\\.?", "", names(z))
    as.data.frame(as.list(z), stringsAsFactors = FALSE)
    })
    do.call("rbind.fill", y)
    worksheets <- pblapply(seq_along(worksheets), function(i) {

    x <- xpathApply(worksheets[[i]], "//x:c", namespaces = "x", function(node) {
    c("v" = xmlValue(node[["v"]]), xmlAttrs(node))
    })
    x <- do.call("rbind.fill", x)
    x$sheet <- sheet_names[sheet_names$sheetId == i, "name"]

    if(length(x) > 0) {

    x_rows <- unlist(lapply(seq_along(x), function(i) rep(i, length(x[[i]]))))
    x <- unlist(x)

    x <- reshape(
    data.frame(
    "row" = x_rows,
    "ind" = names(x),
    "value" = x,
    stringsAsFactors = FALSE),
    idvar = "row", timevar = "ind", direction = "wide")

    x$sheet <- sheet_names[sheet_names$id == i, "name"]
    colnames(x) <- gsub("^value\\.", "", colnames(x))
    }
    x
    })
    worksheets <- do.call("rbind.fill",
    worksheets[sapply(worksheets, class) == "data.frame"])

    entries <- xmlToList(xmlParse(list.files(paste0(tempdir(), "/xl"),
    full.name = TRUE, pattern = "sharedStrings.xml$")))
    entries <- unlist(entries)
    entries <- entries[names(entries) == "si.t"]
    entries <- xmlParse(list.files(paste0(tempdir(), "/xl"), full.name = TRUE,
    pattern = "sharedStrings.xml$"))
    entries <- xpathSApply(entries, "//x:t", namespaces = "x", xmlValue)
    names(entries) <- seq_along(entries) - 1

    entries_match <- entries[match(worksheets$v, names(entries))]
    worksheets$v[worksheets$t == "s" & !is.na(worksheets$t)] <-
    entries_match[worksheets$t == "s"& !is.na(worksheets$t)]
    worksheets$cols <- match(gsub("\\d", "", worksheets$r), LETTERS)
    worksheets$rows <- as.numeric(gsub("\\D", "", worksheets$r))

    if(!is.null(keep_sheets)) {
    worksheets <- worksheets[sheet %in% keep_sheets,]
    }

    if(!any(grepl("^s$", colnames(worksheets)))) {
    worksheets$s <- NA
    }
  15. schaunwheeler revised this gist Jun 23, 2013. 1 changed file with 36 additions and 16 deletions.
    52 changes: 36 additions & 16 deletions xlsxToR.r
    Original file line number Diff line number Diff line change
    @@ -1,26 +1,36 @@
    xlsxToR <- function(file) {

    xlsxToR <- function(file, keep_sheets = NULL, header = FALSE) {
    require(XML)
    require(plyr)

    suppressWarnings(file.remove(tempdir()))
    file.copy(file, tempdir())
    new_file <- list.files(tempdir(), full.name = TRUE, pattern = basename(file))
    new_file_rename <- gsub("xlsx$", "zip", new_file)
    file.rename(new_file, new_file_rename)

    unzip(new_file_rename, exdir = tempdir())

    # Get OS
    mac <- xmlToList(xmlParse(list.files(
    paste0(tempdir(), "/docProps"), full.name = TRUE, pattern = "app.xml")))
    mac <- grepl("Macintosh", mac$Application)
    if(mac) {
    os_origin <- "1899-12-30" # documentation says should be "1904-01-01"
    } else {
    os_origin <- "1899-12-30"
    }

    # Get names of sheets
    sheet_names <- xmlToList(list.files(
    paste0(tempdir(), "/xl"), full.name = TRUE, pattern = "workbook.xml"))
    sheet_names <- xmlToList(xmlParse(list.files(
    paste0(tempdir(), "/xl"), full.name = TRUE, pattern = "workbook.xml")))
    sheet_names <- do.call("rbind", sheet_names$sheets)
    rownames(sheet_names) <- NULL
    sheet_names <- as.data.frame(sheet_names,stringsAsFactors = FALSE)

    # Get column classes
    styles <- xmlToList(list.files(
    paste0(tempdir(), "/xl"), full.name = TRUE, pattern = "styles.xml"))
    styles <- xmlToList(xmlParse(list.files(
    paste0(tempdir(), "/xl"), full.name = TRUE, pattern = "styles.xml")))
    styles <- styles$cellXfs[
    sapply(styles$cellXfs, function(x) any(names(x) == "applyNumberFormat"))]
    styles <- do.call("rbind", lapply(styles,
    @@ -30,7 +40,7 @@ xlsxToR <- function(file) {
    worksheet_paths <- list.files(paste0(tempdir(), "/xl/worksheets"),
    full.name = TRUE, pattern = "xml$")

    worksheets <- lapply(worksheet_paths, function(x) xmlToList(x)$sheetData)
    worksheets <- lapply(worksheet_paths, function(x) xmlToList(xmlParse(x))$sheetData)
    worksheets <- lapply(seq_along(worksheets), function(i) {
    x <- lapply(worksheets[[i]], function(y) {
    y <- y[names(y) == "c"]
    @@ -48,8 +58,8 @@ xlsxToR <- function(file) {
    worksheets <- do.call("rbind.fill",
    worksheets[sapply(worksheets, class) == "data.frame"])

    entries <- xmlToList(list.files(paste0(tempdir(), "/xl"),
    full.name = TRUE, pattern = "sharedStrings.xml$"))
    entries <- xmlToList(xmlParse(list.files(paste0(tempdir(), "/xl"),
    full.name = TRUE, pattern = "sharedStrings.xml$")))
    entries <- unlist(entries)
    entries <- entries[names(entries) == "si.t"]
    names(entries) <- seq_along(entries) - 1
    @@ -60,22 +70,32 @@ xlsxToR <- function(file) {
    worksheets$cols <- match(gsub("\\d", "", worksheets$r), LETTERS)
    worksheets$rows <- as.numeric(gsub("\\D", "", worksheets$r))

    if(!is.null(keep_sheets)) {
    worksheets <- worksheets[sheet %in% keep_sheets,]
    }

    if(!any(grepl("^s$", colnames(worksheets)))) {
    worksheets$s <- NA
    }

    workbook <- lapply(unique(worksheets$sheet), function(x) {
    y <- worksheets[worksheets$sheet == x,]
    y_style <- as.data.frame(tapply(y$s, list(y$rows, y$cols), identity),
    stringsAsFactors = FALSE)
    y <- as.data.frame(tapply(y$v, list(y$rows, y$cols), identity),
    stringsAsFactors = FALSE)

    if(all(!is.na(y[1,]))) {
    if(header) {
    colnames(y) <- y[1,]
    y <- y[-1,]
    y_style <- y_style[-1,]
    }

    y_style <- sapply(y_style,
    function(x) ifelse(length(unique(x)) == 1, unique(x), NA))
    y_style <- styles$numFmtId[match(y_style, styles$applyNumberFormat)]
    if(length(styles) > 0) {
    y_style <- styles$numFmtId[match(y_style, styles$applyNumberFormat)]
    }
    y_style[y_style %in% 14:17] <- "date"
    y_style[y_style %in% c(18:21, 45:47)] <- "time"
    y_style[y_style %in% 22] <- "datetime"
    @@ -86,9 +106,9 @@ xlsxToR <- function(file) {
    switch(y_style[i],
    character = y[,i],
    numeric = as.numeric(y[,i]),
    date = as.Date(as.numeric(y[,i]), origin="1899-12-30"),
    time = strftime(as.POSIXct(as.numeric(y[,i]), origin="1899-12-30"), format = "%H:%M:%S"),
    datetime = as.POSIXct(as.numeric(y[,i]), origin="1899-12-30"))
    date = as.Date(as.numeric(y[,i]), origin = os_origin),
    time = strftime(as.POSIXct(as.numeric(y[,i]), origin = os_origin), format = "%H:%M:%S"),
    datetime = as.POSIXct(as.numeric(y[,i]), origin = os_origin))
    })
    y
    })
    @@ -98,4 +118,4 @@ xlsxToR <- function(file) {
    }

    workbook
    }
    }
  16. schaunwheeler revised this gist Jun 21, 2013. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions xlsxToR.r
    Original file line number Diff line number Diff line change
    @@ -87,8 +87,8 @@ xlsxToR <- function(file) {
    character = y[,i],
    numeric = as.numeric(y[,i]),
    date = as.Date(as.numeric(y[,i]), origin="1899-12-30"),
    time = as.Date(as.numeric(y[,i]), origin="1899-12-30"),
    datetime = as.Date(as.numeric(y[,i]), origin="1899-12-30"))
    time = strftime(as.POSIXct(as.numeric(y[,i]), origin="1899-12-30"), format = "%H:%M:%S"),
    datetime = as.POSIXct(as.numeric(y[,i]), origin="1899-12-30"))
    })
    y
    })
  17. schaunwheeler renamed this gist Jun 20, 2013. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  18. schaunwheeler created this gist Jun 20, 2013.
    101 changes: 101 additions & 0 deletions xlsxToR
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,101 @@
    xlsxToR <- function(file) {

    require(XML)
    require(plyr)

    suppressWarnings(file.remove(tempdir()))
    file.copy(file, tempdir())
    new_file <- list.files(tempdir(), full.name = TRUE, pattern = basename(file))
    new_file_rename <- gsub("xlsx$", "zip", new_file)
    file.rename(new_file, new_file_rename)

    unzip(new_file_rename, exdir = tempdir())

    # Get names of sheets
    sheet_names <- xmlToList(list.files(
    paste0(tempdir(), "/xl"), full.name = TRUE, pattern = "workbook.xml"))
    sheet_names <- do.call("rbind", sheet_names$sheets)
    rownames(sheet_names) <- NULL
    sheet_names <- as.data.frame(sheet_names,stringsAsFactors = FALSE)

    # Get column classes
    styles <- xmlToList(list.files(
    paste0(tempdir(), "/xl"), full.name = TRUE, pattern = "styles.xml"))
    styles <- styles$cellXfs[
    sapply(styles$cellXfs, function(x) any(names(x) == "applyNumberFormat"))]
    styles <- do.call("rbind", lapply(styles,
    function(x) as.data.frame(as.list(x[c("applyNumberFormat", "numFmtId")]),
    stringsAsFactors = FALSE)))

    worksheet_paths <- list.files(paste0(tempdir(), "/xl/worksheets"),
    full.name = TRUE, pattern = "xml$")

    worksheets <- lapply(worksheet_paths, function(x) xmlToList(x)$sheetData)
    worksheets <- lapply(seq_along(worksheets), function(i) {
    x <- lapply(worksheets[[i]], function(y) {
    y <- y[names(y) == "c"]
    y <- lapply(y, function(z) {
    z <- unlist(z)
    names(z) <- gsub("\\.?attrs\\.?", "", names(z))
    as.data.frame(as.list(z), stringsAsFactors = FALSE)
    })
    do.call("rbind.fill", y)
    })
    x <- do.call("rbind.fill", x)
    x$sheet <- sheet_names[sheet_names$sheetId == i, "name"]
    x
    })
    worksheets <- do.call("rbind.fill",
    worksheets[sapply(worksheets, class) == "data.frame"])

    entries <- xmlToList(list.files(paste0(tempdir(), "/xl"),
    full.name = TRUE, pattern = "sharedStrings.xml$"))
    entries <- unlist(entries)
    entries <- entries[names(entries) == "si.t"]
    names(entries) <- seq_along(entries) - 1

    entries_match <- entries[match(worksheets$v, names(entries))]
    worksheets$v[worksheets$t == "s" & !is.na(worksheets$t)] <-
    entries_match[worksheets$t == "s"& !is.na(worksheets$t)]
    worksheets$cols <- match(gsub("\\d", "", worksheets$r), LETTERS)
    worksheets$rows <- as.numeric(gsub("\\D", "", worksheets$r))

    workbook <- lapply(unique(worksheets$sheet), function(x) {
    y <- worksheets[worksheets$sheet == x,]
    y_style <- as.data.frame(tapply(y$s, list(y$rows, y$cols), identity),
    stringsAsFactors = FALSE)
    y <- as.data.frame(tapply(y$v, list(y$rows, y$cols), identity),
    stringsAsFactors = FALSE)

    if(all(!is.na(y[1,]))) {
    colnames(y) <- y[1,]
    y <- y[-1,]
    y_style <- y_style[-1,]
    }

    y_style <- sapply(y_style,
    function(x) ifelse(length(unique(x)) == 1, unique(x), NA))
    y_style <- styles$numFmtId[match(y_style, styles$applyNumberFormat)]
    y_style[y_style %in% 14:17] <- "date"
    y_style[y_style %in% c(18:21, 45:47)] <- "time"
    y_style[y_style %in% 22] <- "datetime"
    y_style[is.na(y_style) & !sapply(y, function(x)any(grepl("\\D", x)))] <- "numeric"
    y_style[is.na(y_style)] <- "character"

    y[] <- lapply(seq_along(y), function(i) {
    switch(y_style[i],
    character = y[,i],
    numeric = as.numeric(y[,i]),
    date = as.Date(as.numeric(y[,i]), origin="1899-12-30"),
    time = as.Date(as.numeric(y[,i]), origin="1899-12-30"),
    datetime = as.Date(as.numeric(y[,i]), origin="1899-12-30"))
    })
    y
    })

    if(length(workbook) == 1) {
    workbook <- workbook[[1]]
    }

    workbook
    }