R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk as shown below.

R Markdown & life advice to past self analyzing datasets

Trust me when I say it will make your life much easier if you keep a running document with your analyses and explanations of what you did and why you were doing it, especially if you are juggling multiple projects and need to put something down for a month or so.

In the past, my “organisational” strategy was to have a giant .R file with all my R code for a project, and I would occasionally update a separate Word document with some of the better plots and try to write them up.

This is a bad strategy; don’t be past me.

Having a single document where you mix code, figures, and explanations will make things much easier.

library(ghostr)
library(nycflights13)
library(ggplot2)
library(readr)
library(dplyr)
data('ghost_sightings')
data('flights')

There are 5 dplyr verbs

dplyr examples come from http://sharpsightlabs.com/blog/2014/12/11/dplyr-intro-data-manipulation-with-r/

filter()

# filter() keeps only the rows that satisfy the condition:
# here, the diamonds whose cut is "Ideal".
df.diamonds_ideal <- filter(diamonds, cut=="Ideal")
head(df.diamonds_ideal)
## # A tibble: 6 x 10
##   carat   cut color clarity depth table price     x     y     z
##   <dbl> <ord> <ord>   <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal     E     SI2  61.5    55   326  3.95  3.98  2.43
## 2  0.23 Ideal     J     VS1  62.8    56   340  3.93  3.90  2.46
## 3  0.31 Ideal     J     SI2  62.2    54   344  4.35  4.37  2.71
## 4  0.30 Ideal     I     SI2  62.0    54   348  4.31  4.34  2.68
## 5  0.33 Ideal     I     SI2  61.8    55   403  4.49  4.51  2.78
## 6  0.33 Ideal     I     SI2  61.2    56   403  4.49  4.50  2.75

select()

# select() keeps only the named columns, in the order they are listed.
df.diamonds_ideal <- select(df.diamonds_ideal, carat, cut, color, price, clarity)

head(df.diamonds_ideal)
## # A tibble: 6 x 5
##   carat   cut color price clarity
##   <dbl> <ord> <ord> <int>   <ord>
## 1  0.23 Ideal     E   326     SI2
## 2  0.23 Ideal     J   340     VS1
## 3  0.31 Ideal     J   344     SI2
## 4  0.30 Ideal     I   348     SI2
## 5  0.33 Ideal     I   403     SI2
## 6  0.33 Ideal     I   403     SI2

mutate()

# mutate() adds a new column computed from existing ones:
# price_per_carat is price divided by carat, row by row.
df.diamonds_ideal <- mutate(df.diamonds_ideal, price_per_carat = price/carat)

head(df.diamonds_ideal)
## # A tibble: 6 x 6
##   carat   cut color price clarity price_per_carat
##   <dbl> <ord> <ord> <int>   <ord>           <dbl>
## 1  0.23 Ideal     E   326     SI2        1417.391
## 2  0.23 Ideal     J   340     VS1        1478.261
## 3  0.31 Ideal     J   344     SI2        1109.677
## 4  0.30 Ideal     I   348     SI2        1160.000
## 5  0.33 Ideal     I   403     SI2        1221.212
## 6  0.33 Ideal     I   403     SI2        1221.212

arrange()

# arrange() reorders the rows; here they are sorted by carat, smallest first.
df.diamonds_ideal <- arrange(df.diamonds_ideal, carat)

head(df.diamonds_ideal)
## # A tibble: 6 x 6
##   carat   cut color price clarity price_per_carat
##   <dbl> <ord> <ord> <int>   <ord>           <dbl>
## 1  0.20 Ideal     E   367     VS2        1835.000
## 2  0.20 Ideal     D   367     VS2        1835.000
## 3  0.20 Ideal     E   367     VS2        1835.000
## 4  0.23 Ideal     E   326     SI2        1417.391
## 5  0.23 Ideal     J   340     VS1        1478.261
## 6  0.23 Ideal     G   404     VS1        1756.522

summarize()

# summarize() collapses the table to a single row of summary values;
# na.rm = TRUE makes mean() ignore missing prices instead of returning NA.
summarize(df.diamonds_ideal, avg_price = mean(price, na.rm = TRUE) )
## # A tibble: 1 x 1
##   avg_price
##       <dbl>
## 1  3457.542

Chaining dplyr verbs with %>% (pipe)

# filter -> select -> mutate written as one pipeline: each %>% hands the
# result on its left to the call on its right as that call's first argument.
df.diamonds_ideal_chained <- diamonds %>%
  filter(cut == "Ideal") %>%
  select(carat, cut, color, price, clarity) %>%
  mutate(price_per_carat = price / carat)
head(df.diamonds_ideal_chained)
## # A tibble: 6 x 6
##   carat   cut color price clarity price_per_carat
##   <dbl> <ord> <ord> <int>   <ord>           <dbl>
## 1  0.23 Ideal     E   326     SI2        1417.391
## 2  0.23 Ideal     J   340     VS1        1478.261
## 3  0.31 Ideal     J   344     SI2        1109.677
## 4  0.30 Ideal     I   348     SI2        1160.000
## 5  0.33 Ideal     I   403     SI2        1221.212
## 6  0.33 Ideal     I   403     SI2        1221.212

rapid plotting

# A dplyr pipeline can feed straight into ggplot(); note that ggplot layers
# are combined with `+`, not with the pipe.
diamonds %>%                                        # Start with the 'diamonds' dataset
  filter(cut == "Ideal") %>%                        # Then, filter down to rows where cut == Ideal
  ggplot(aes(x=color,y=price)) +                     # Then, plot price against color using ggplot
    geom_boxplot()                                  #  and draw it as a boxplot

Twitter mining and social network analysis

Here I will introduce some code to look at people talking around hashtags (mostly scientific conferences). You can get the code at https://github.com/thomas-keller/tweet-conf . A more extended explanation of these analyses is at my website http://thomas-keller.github.io/articles/I-analyzed-evolution-2016-twitter-and-you-can-too-for-other-conferences/ .

In this example I’m just using a csv of parsed tweets rather than downloading them, so there is one less thing that can go wrong.

library(twitteR)
library(ROAuth)
library(tidytext)
library(ggplot2)
library(wordcloud)
library(dplyr)

#formatTwDate from SmappR https://github.com/SMAPPNYU/smappR/
#By Pablo Barbera http://pablobarbera.com/ and others

# Parse Twitter-style timestamps such as "Wed Aug 03 12:34:56 +0000 2016".
#
# Arguments:
#   datestring  character vector of timestamps in Twitter's `created_at` layout.
#   format      "datetime" (default) to get a POSIXct vector, or
#               "date" to get a Date vector.
#
# Returns a POSIXct or Date vector the same length as `datestring`; entries
# that do not match the expected layout come back as NA.
#
# Any other `format` value is an error. (Previously the local `date` was never
# assigned in that case, so the function silently returned base R's `date`
# function instead of a parsed value.)
#
# Note: %a and %b are matched against the current locale's day/month names,
# so English timestamps need an English (or C) LC_TIME locale.
formatTwDate <- function(datestring, format = "datetime") {
  format <- match.arg(format, c("datetime", "date"))
  tw_layout <- "%a %b %d %H:%M:%S %z %Y"
  switch(format,
    datetime = as.POSIXct(datestring, format = tw_layout),
    date = as.Date(datestring, format = tw_layout)
  )
}

# Search on the hashtag (you can use multiple hashtags if you want/need)
# and convert the result to a data frame.
hashtag <- "#SciPy2016"
confname <- substring(hashtag, 2)  # hashtag without the leading "#"

# Live download, disabled here in favour of the pre-parsed csv:
# tw_list <- searchTwitter(hashtag, n = 1e4, since = '2016-08-3')# , until='2016-07-14') #~5k tweets
# tw_df <- twListToDF(tw_list)

# Read the saved tweets and drop exact duplicate rows.
tw_df <- unique(read_csv("scipy2016.csv"))

filename <- paste0(confname, ".csv")
# write.csv(tw_df, file = filename, row.names = FALSE)

Here is a simple wordcloud that demonstrates using tidytext (unnest_tokens) and dplyr chaining. The pipe %>% basically feeds the object on the left to the function on the right.

# Screen names of the tweet authors, lower-cased so they can be matched
# against the word tokens and removed from the cloud. The `lexicon` column
# only mirrors the shape of `stop_words`.
users <- data.frame(
  word = tolower(tw_df$screenName),
  lexicon = rep("whatevs", nrow(tw_df)),
  stringsAsFactors = FALSE  # keep `word` as character so the join needs no coercion
)

# Break tweets down into one word per row for tidy (word-level) analyses.
tidy_tw <- tw_df %>% unnest_tokens(word, text)

# Uninformative words / ones that oversaturate the wordcloud: the conference
# name, URL and retweet fragments, and bare numbers.
tw_stop <- data.frame(
  word = c(
    confname, tolower(confname),
    "htt", "25", "http", "amp", "gt", "t.c", "rt", "https", "t.co", "___",
    "1", "2", "3", "4", "5", "6", "7", "8", "9",
    "i'm", "15", "30", "45", "00", "10"
  ),
  lexicon = "whatevs",
  stringsAsFactors = FALSE
)
data("stop_words")

# Drop custom stop words, standard English stop words, and user names.
# The joins are keyed explicitly on `word` so they do not depend on which
# other column names the tables happen to share, and do not emit
# "Joining, by = ..." messages.
tidy_cloud <- tidy_tw %>%
  anti_join(tw_stop, by = "word") %>%
  anti_join(stop_words, by = "word") %>%
  anti_join(users, by = "word")

print(tidy_cloud %>% count(word, sort = TRUE)) 
## # A tibble: 2,532 x 2
##          word     n
##         <chr> <int>
## 1      python   543
## 2        talk   518
## 3  jupyterlab   292
## 4        tool   220
## 5       scipy   196
## 6     jupyter   180
## 7        data   171
## 8       alpha   168
## 9        blog   167
## 10       post   167
## # ... with 2,522 more rows
# Tally how often each remaining word occurs, then draw up to 100 words,
# coloured with the 8-colour "Dark2" palette.
tidy_cloud %>%
  count(word) %>%
  with(
    wordcloud(word, n, max.words = 100, colors = brewer.pal(8, "Dark2"))
  )
## Warning in wordcloud(word, n, max.words = 100, colors = brewer.pal(8,
## "Dark2")): python could not be fit on page. It will not be plotted.