Skip to content

Instantly share code, notes, and snippets.

@mjbommar
Created May 21, 2012 12:59
Show Gist options
  • Select an option

  • Save mjbommar/2762204 to your computer and use it in GitHub Desktop.

Select an option

Save mjbommar/2762204 to your computer and use it in GitHub Desktop.

Revisions

  1. mjbommar created this gist May 21, 2012.
    45 changes: 45 additions & 0 deletions plotHashtag2.R
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,45 @@

    # @author: Bommarito Consulting, LLC; http://michaelbommarito.com/
    # @date: May 21, 2012
    # @email: michael@bommaritollc.com
    # @packages: ggplot2, plyr

    # Import the plotting and data-manipulation packages used by this script.
    # The workspace is intentionally NOT cleared with rm(list=ls()): doing so
    # deletes the caller's objects when this file is source()'d, yet still does
    # not reset loaded packages or options. Run the script in a fresh R session
    # if a clean environment is needed.
    library(ggplot2)
    library(plyr)

    # Controlling parameters.
    hashtag <- "#nonato" # Hashtag for label purposes
    # First timestamp we will consider. "EDT" is not a valid time-zone name, so
    # R would fall back to UTC; use the Olson name for US Eastern time instead.
    # NOTE(review): assumes the cutoff was meant as US Eastern local time.
    cutoff <- as.POSIXct("2012-01-11 00:00:00", tz = "America/New_York")
    dt <- 30 # \Delta t, bin width in minutes

    # Read the tab-separated tweet dump (no header row, quoting and comment
    # characters disabled, at most 300000 rows), then drop exact duplicate rows.
    tweets <- read.table("data/tweets.csv", header = FALSE, sep = "\t",
      quote = "", comment.char = "", nrows = 300000, stringsAsFactors = FALSE)
    tweets <- unique(tweets)
    colnames(tweets) <- c("id", "date", "user", "text")

    # Parse the textual timestamps into POSIXct.
    tweets[["date"]] <- as.POSIXct(
      strptime(tweets[["date"]], format = "%a, %d %b %Y %H:%M:%S %z", tz = "GMT")
    )

    # Keep only rows strictly after the cutoff; rows whose date failed to parse
    # (NA) are dropped as well.
    tweets <- tweets[!is.na(tweets[["date"]]) & tweets[["date"]] > cutoff, ]

    # Bin edges: start at the earliest tweet and step by dt minutes (expressed
    # in seconds) up to one bin width past the latest tweet.
    minDate <- range(tweets$date)[1]
    maxDate <- range(tweets$date)[2] + dt * 60
    dateBreaks <- seq(from = minDate, to = maxDate, by = dt * 60)

    # Use hist to count the number of tweets per bin; don't plot.
    # right=FALSE makes the bins left-closed, [start, start + dt), so they agree
    # with the per-bin user window computed below.
    tweetCount <- hist(tweets$date, breaks=dateBreaks, right=FALSE, plot=FALSE)

    # Left endpoint of each bin: every break except the last one.
    # (The former `1:length(x)-1` parsed as `(1:n) - 1` and only worked because
    # R silently ignores a zero index.)
    binBreaks <- tweetCount$breaks[-length(tweetCount$breaks)]

    # Count number of unique tweeters per bin. The window is half-open so a
    # tweet that falls exactly on a bin boundary is attributed to one bin only.
    userCount <- vapply(binBreaks, function(d) {
      inBin <- (tweets$date >= d) & (tweets$date < d + 60 * dt)
      length(unique(tweets$user[which(inBin)]))
    }, integer(1))

    # Assemble one row per bin: bin start time, tweet count and unique users.
    # `-length(x)` drops the last break (the right edge of the final bin), and
    # `$counts` is spelled out rather than relying on partial matching.
    plotData <- data.frame(
      dates = dateBreaks[-length(dateBreaks)],
      tweets = as.numeric(tweetCount$counts),
      users = as.numeric(userCount)
    )

    # Bar height is the tweet count; bar outline colour encodes unique users.
    # labs(title=) replaces opts(title=), which is defunct in current ggplot2.
    # The title is built from `hashtag` so the label follows the parameter.
    tweetPlot <- ggplot(plotData) +
      geom_bar(aes(x = dates, y = tweets, color = users), stat = "identity") +
      scale_x_datetime("Date") +
      scale_y_continuous("Number of tweets") +
      labs(title = paste("Number of tweets and unique users :", hashtag))
    print(tweetPlot)

    # Save the plot explicitly instead of relying on the implicit last plot.
    ggsave("fig/ts_tweet_user.jpg", plot = tweetPlot, width = 12, height = 8)