R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document is generated that includes both the text and the output of any embedded R code chunks. You can embed an R code chunk like the one shown below.
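In the source .Rmd file, a chunk is fenced with three backticks and an {r} header; everything between the fences runs when you knit. A minimal example (the chunk label summary-example is just a placeholder name):

```{r summary-example}
summary(cars)
```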

R Markdown & life advice to my past self analyzing datasets

Trust me when I say it will make your life much easier if you keep a running document with your analyses and explanations of what you did and why you did it, especially if you are juggling multiple projects and need to put one down for a month or so.

In the past, my “organisational” strategy was a giant .R file with all my R code for a project, and I would occasionally update a separate Word document with some of the better plots and try to write them up.

This is a bad strategy; don’t be past me.

Having a single document where you mix code, figures, and explanations will make things much easier.

library(ghostr)
library(nycflights13)
library(ggplot2)
library(readr)
library(dplyr)
data('ghost_sightings')
data('flights')

There are five core dplyr verbs: filter(), select(), mutate(), arrange(), and summarize().

The dplyr examples below come from http://sharpsightlabs.com/blog/2014/12/11/dplyr-intro-data-manipulation-with-r/

filter()

df.diamonds_ideal <- filter(diamonds, cut=="Ideal")
head(df.diamonds_ideal)
## # A tibble: 6 x 10
##   carat   cut color clarity depth table price     x     y     z
##   <dbl> <ord> <ord>   <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal     E     SI2  61.5    55   326  3.95  3.98  2.43
## 2  0.23 Ideal     J     VS1  62.8    56   340  3.93  3.90  2.46
## 3  0.31 Ideal     J     SI2  62.2    54   344  4.35  4.37  2.71
## 4  0.30 Ideal     I     SI2  62.0    54   348  4.31  4.34  2.68
## 5  0.33 Ideal     I     SI2  61.8    55   403  4.49  4.51  2.78
## 6  0.33 Ideal     I     SI2  61.2    56   403  4.49  4.50  2.75

select()

df.diamonds_ideal <- select(df.diamonds_ideal, carat, cut, color, price, clarity)

head(df.diamonds_ideal)
## # A tibble: 6 x 5
##   carat   cut color price clarity
##   <dbl> <ord> <ord> <int>   <ord>
## 1  0.23 Ideal     E   326     SI2
## 2  0.23 Ideal     J   340     VS1
## 3  0.31 Ideal     J   344     SI2
## 4  0.30 Ideal     I   348     SI2
## 5  0.33 Ideal     I   403     SI2
## 6  0.33 Ideal     I   403     SI2

mutate()

df.diamonds_ideal <- mutate(df.diamonds_ideal, price_per_carat = price/carat)

head(df.diamonds_ideal)
## # A tibble: 6 x 6
##   carat   cut color price clarity price_per_carat
##   <dbl> <ord> <ord> <int>   <ord>           <dbl>
## 1  0.23 Ideal     E   326     SI2        1417.391
## 2  0.23 Ideal     J   340     VS1        1478.261
## 3  0.31 Ideal     J   344     SI2        1109.677
## 4  0.30 Ideal     I   348     SI2        1160.000
## 5  0.33 Ideal     I   403     SI2        1221.212
## 6  0.33 Ideal     I   403     SI2        1221.212

arrange()

df.diamonds_ideal <- arrange(df.diamonds_ideal, carat)

head(df.diamonds_ideal)
## # A tibble: 6 x 6
##   carat   cut color price clarity price_per_carat
##   <dbl> <ord> <ord> <int>   <ord>           <dbl>
## 1  0.20 Ideal     E   367     VS2        1835.000
## 2  0.20 Ideal     D   367     VS2        1835.000
## 3  0.20 Ideal     E   367     VS2        1835.000
## 4  0.23 Ideal     E   326     SI2        1417.391
## 5  0.23 Ideal     J   340     VS1        1478.261
## 6  0.23 Ideal     G   404     VS1        1756.522

summarize()

summarize(df.diamonds_ideal, avg_price = mean(price, na.rm = TRUE) )
## # A tibble: 1 x 1
##   avg_price
##       <dbl>
## 1  3457.542
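summarize() gets much more useful once you combine it with group_by(), which is not one of the five verbs above but works with all of them. A quick sketch computing the average price within each cut:

diamonds %>%
  group_by(cut) %>%
  summarize(avg_price = mean(price, na.rm = TRUE))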

Chaining dplyr verbs with %>% (pipe)

df.diamonds_ideal_chained <- diamonds %>%
                              filter(cut=="Ideal") %>%
                              select(carat, cut, color, price, clarity) %>%
                              mutate(price_per_carat = price/carat)
head(df.diamonds_ideal_chained)
## # A tibble: 6 x 6
##   carat   cut color price clarity price_per_carat
##   <dbl> <ord> <ord> <int>   <ord>           <dbl>
## 1  0.23 Ideal     E   326     SI2        1417.391
## 2  0.23 Ideal     J   340     VS1        1478.261
## 3  0.31 Ideal     J   344     SI2        1109.677
## 4  0.30 Ideal     I   348     SI2        1160.000
## 5  0.33 Ideal     I   403     SI2        1221.212
## 6  0.33 Ideal     I   403     SI2        1221.212
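For comparison, here is the same pipeline written without the pipe. The nested calls have to be read inside-out, which is exactly what %>% saves you from:

df.diamonds_ideal_nested <- mutate(
  select(
    filter(diamonds, cut == "Ideal"),
    carat, cut, color, price, clarity),
  price_per_carat = price / carat)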

rapid plotting

diamonds %>%                                        # Start with the 'diamonds' dataset
  filter(cut == "Ideal") %>%                        # Then, filter down to rows where cut == Ideal
  ggplot(aes(x=color,y=price)) +                    # Then, plot using ggplot
    geom_boxplot()                                  #  and draw a boxplot

Twitter mining and social network analysis

Here I will introduce some code to look at people talking around hashtags (mostly scientific conferences). You can get the code at https://github.com/thomas-keller/tweet-conf . A more extended explanation of these analyses is at my website http://thomas-keller.github.io/articles/I-analyzed-evolution-2016-twitter-and-you-can-too-for-other-conferences/ .

In this example I’m using a csv of already-parsed tweets rather than downloading them live, to remove one thing that can go wrong.

library(twitteR)
library(ROAuth)
library(tidytext)
library(ggplot2)
library(wordcloud)
library(dplyr)

#formatTwDate from SmappR https://github.com/SMAPPNYU/smappR/
#By Pablo Barbera http://pablobarbera.com/ and others

formatTwDate <- function(datestring, format="datetime"){
  if (format=="datetime"){
    date <- as.POSIXct(datestring, format="%a %b %d %H:%M:%S %z %Y")
  }
  if (format=="date"){
    date <- as.Date(datestring, format="%a %b %d %H:%M:%S %z %Y")
  }   
  return(date)
}
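The format string matches Twitter’s created_at timestamps. A quick usage sketch (the timestamp below is made up for illustration):

formatTwDate("Wed Jul 13 12:34:56 +0000 2016")                  # POSIXct datetime
formatTwDate("Wed Jul 13 12:34:56 +0000 2016", format="date")   # Date only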

#search along the hashtag (you can have multiple hashtags if you want/need)
#convert to dataframe
hashtag<-'#SciPy2016'
confname<-substr(hashtag,2,nchar(hashtag))
#tw_list <- searchTwitter(hashtag, n = 1e4, since = '2016-08-3')# , until='2016-07-14') #~5k tweets
tw_df<-read_csv('scipy2016.csv') 
#tw_df<-twListToDF(tw_list)
tw_df<-unique(tw_df)
filename<-paste0(confname,".csv")
#write.csv(tw_df,file=filename,row.names=F)

Here is a simple wordcloud that demonstrates tidytext (unnest_tokens breaks each tweet into one word per row) and dplyr chaining. The pipe %>% feeds the object on its left into the function on its right, and each anti_join() drops the words that appear in a stop-word table.

users<-data.frame(word=tolower(tw_df$screenName),lexicon=rep('whatevs',nrow(tw_df)))
#breaks down tweets into words for tidy (word) level analyses
tidy_tw<-tw_df %>% unnest_tokens(word,text)

#removes uninformative words / ones that oversaturate the wordcloud
tw_stop<-data.frame(word=c(confname,tolower(confname),'htt','25','http','amp','gt','t.c','rt','https','t.co','___','1','2','3','4','5','6','7','8','9',"i\'m",'15','30','45','00','10'),lexicon='whatevs')
data("stop_words")
tidy_cloud <- tidy_tw %>%
 anti_join(tw_stop) %>%
  anti_join(stop_words) %>%
  anti_join(users)

print(tidy_cloud %>% count(word, sort = TRUE)) 
## # A tibble: 2,532 x 2
##          word     n
##         <chr> <int>
## 1      python   543
## 2        talk   518
## 3  jupyterlab   292
## 4        tool   220
## 5       scipy   196
## 6     jupyter   180
## 7        data   171
## 8       alpha   168
## 9        blog   167
## 10       post   167
## # ... with 2,522 more rows
tidy_cloud %>%
 count(word) %>%
 with(wordcloud(word, n,max.words = 100,colors=brewer.pal(8,'Dark2')))
## Warning in wordcloud(word, n, max.words = 100, colors = brewer.pal(8,
## "Dark2")): python could not be fit on page. It will not be plotted.

Now we want to get a sense of who the most active users of this hashtag are. Here I’m introducing some ways to modify the default ggplot output.

user.tweets <- as.data.frame(table(tw_df$screenName))
names(user.tweets) <- c("User", "Tweets")

# Order the table by number of tweets per user & do some culling
user.tweets <- user.tweets[with(user.tweets, order(-Tweets)), ]
user.tweets_fig<-user.tweets[user.tweets$Tweets>2,]
user.tweets_fig<-user.tweets_fig[1:40,]

#make the plot for the top 40 or so
#I normally hate the x and y guide lines, but they serve a purpose with the extreme skew and names
ggplot(data=user.tweets_fig, aes(x=reorder(User, Tweets), y=Tweets)) +
  geom_bar(stat='identity') +
  coord_flip() +
  scale_y_continuous("Tweets") +
  scale_x_discrete("User") +
  labs(title = paste(hashtag, " tweets per user")) +
  theme_bw() +
  theme(axis.title = element_text(face="bold"), axis.text.y = element_text(size=6))

Twitter sentiment

This is a simple “bag of words” sentiment analysis: each word of a tweet is matched against a lexicon of positive and negative words, and a tweet’s score is its count of positive words minus its count of negative words.

tidy_tw$created<-formatTwDate(tidy_tw$created)
tw_df$created<-formatTwDate(tw_df$created)


library(tidyr)
bing <- sentiments %>%
 filter(lexicon == "bing") %>%
 select(-score)

conf_sent <- tidy_tw %>%
 inner_join(bing) %>%
 count(id, sentiment) %>% 
 spread(sentiment, n, fill = 0) %>%
 mutate(sentiment = positive - negative) %>%
 inner_join(tw_df[,c(5,8)]) #join on id and created

library(cowplot)
library(scales)
library(lubridate)

#adjust time zone of tweets with lubridate
conf_sent$created<-ymd_hms(conf_sent$created,tz='EST')

#The plot could include labels, but I didn't have time to figure out what drives the
#inflection points in mood during these other conferences
df_labels<-data.frame(times=strptime(c("2016-07-13 12:00:00","2016-07-15 0:00:00","2016-07-16 16:30:00","2016-07-18 6:30:00"),"%Y-%m-%d %H:%M:%S"),
                      labels=c("it begins!\nmixers for all","science cafe\nfunny-man",'final day\nmixer stuff','that was pretty\ngood reflection'),
                      y=c(1.5,1.0,1.0,1.0))
ggplot(conf_sent, aes(created, sentiment)) +
 geom_smooth() + xlab("tweet time") + ylab("tweet sentiment")+
 scale_x_datetime(breaks = date_breaks("day")) + background_grid(major = "xy", minor = "none") +
 theme(axis.text.x=element_text(angle=315,vjust=.6))+
  #geom_text(data=df_labels,aes(x=times,y=y,label=labels),size=4)+
  ggtitle(paste(hashtag,"positive or negative emotions (think first order ~vibe of conf.)"))

 #coord_cartesian(ylim=c(-.5,1.2)) #+geom_text(data=df_labels,aes(x=times,y=y,label=labels),size=4)

tweet retweet network

library(twitteR)
library(tidytext)
library(ggplot2)
library(dplyr)
library(igraph)
library(stringr)

#count tweets per user, then build a dataframe sorted by number of tweets (descending)
hm<-sort(table(tw_df$screenName))
outdf<-data.frame(screen_name=names(hm),num_tweets=hm)[,c(1,3)]
outdf<-outdf[order(outdf[,2],decreasing=T),]

#OK, start of new code to develop RT network
#code (regex especially!!!) used liberally from
# https://sites.google.com/site/miningtwitter/questions/user-tweets/who-retweet

#TODO:
#replace retweet network construction (not plotting)
#with https://github.com/nfahlgren/conference_twitter_stats/blob/master/retweet_network_generic.R
#it's cleaner and doesn't rely on regex horrors I don't understand

rt_net<-grep("(RT|via)((?:\\b\\W*@\\w+)+)", tw_df$text, 
             ignore.case=TRUE,value=TRUE)
rt_neti<-grep("(RT|via)((?:\\b\\W*@\\w+)+)", tw_df$text, 
              ignore.case=TRUE)
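
#quick sanity check of the regex on a made-up tweet (hypothetical text,
#just to show what the pattern captures)
str_extract_all("RT @somebody: great talk on dplyr!",
                "(RT|via)((?:\\b\\W*@\\w+)+)")  #returns "RT @somebody"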

#next, create list to store user names
who_retweet <- as.list(1:length(rt_net))
who_post <- as.list(1:length(rt_net))

# for loop
for (i in 1:length(rt_net))
{ 
  # get tweet with retweet entity
  twit <- tw_df[rt_neti[i],]
  # get retweet source 
  poster<-str_extract_all(twit$text,"(RT|via)((?:\\b\\W*@\\w+)+)")  
  #remove ':'
  poster <- gsub(":", "", unlist(poster)) 
  # name of retweeted user
  who_post[[i]] <- gsub("(RT @|via @)", "", poster, ignore.case=TRUE) 
  # name of retweeting user 
  who_retweet[[i]] <- rep(twit$screenName, length(poster)) 
}

# unlist
who_post <- unlist(who_post)
who_retweet <- unlist(who_retweet)

####
#Preprocess the dataframes into an edge list (who retweeted whom), the format
#igraph likes

#I guess I need an edge aesthetic for ggraph to paint with
retweeter_poster <- data.frame(from=who_retweet, to=who_post,retweets=1)

#filters out some bad parsing and users who aren't in the node graph
#node_df has the screen_name and number of tweets per user, which will serve as the vertex/node dataframe
#in igraph speak
node_df<-outdf
names(node_df)<-c("id","num_tweets")
#This step is REALLY IMPORTANT for plotting purposes: it determines how dense the network is
#tune the cutoff based on how big you want your plotted network to be
node_df2<-droplevels(node_df[1:50,]) #selecting only the top 50 posting from #evol2016 for plotting purposes
filt_rt_post<-retweeter_poster[retweeter_poster$from %in% node_df2$id & retweeter_poster$to %in% node_df2$id,]
filt_rt_post<-droplevels(filt_rt_post) #ditch all those fleshbags that had to talk to people instead of tweeting
head(filt_rt_post)
##              from           to retweets
## 43        scopatz       dotsdl        1
## 48      SciPyConf  JackieKazil        1
## 51      SciPyConf      ericmjl        1
## 56    chendaniely     jhamrick        1
## 60     geo_leeman dopplershift        1
## 61 jnuneziglesias    SciPyConf        1
#this creates a directed graph with vertex/node info on num_tweets, and edge info on retweets
rt_graph<-graph_from_data_frame(d=droplevels(filt_rt_post),vertices=droplevels(node_df2),directed=T)

#simplify the graph: remove any self-retweets (Twitter now allows those, sadly)
#and merge all the multiple RTs between a pair of users into one weighted edge
#to simplify visualization (edge.attr.comb='sum' adds up the retweet counts)
rt_graph<-simplify(rt_graph,remove.multiple=T,remove.loops=TRUE,edge.attr.comb='sum')
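
A quick sanity check on the simplified graph; vcount() and ecount() report how many users and retweet relationships survived the filtering:

vcount(rt_graph)  #number of users (nodes)
ecount(rt_graph)  #number of retweet edges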

ggraph network

library(ggraph)
library(ggplot2)
#jpeg('evol2016_top50_twitter_network.jpg',width=960,height=960,pointsize=12)
ggraph(rt_graph,'igraph',algorithm='kk')+
  geom_edge_fan(edge_alpha=0.075)+   #fixed transparency; the summed 'retweets' edge attribute could instead be mapped with aes(edge_alpha=retweets)
  geom_node_point(aes(size=num_tweets))+
  geom_node_text(aes(label=name),vjust=-1.5)+
  ggforce::theme_no_axes()+
  theme(legend.position=c(.08,.88))

network degree distribution

deg.dist <-degree_distribution(rt_graph, cumulative=T, mode="all")
deg_df<-data.frame(deg=0:max(degree(rt_graph)),cum_freq=1-deg.dist) 
qplot(deg,cum_freq,data=deg_df,xlab="Degree",ylab="Cumulative Frequency")
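
If you want to eyeball whether the degree distribution is heavy-tailed, the usual first check is to replot on log-log axes. A sketch (rows with zeroes are dropped to avoid log(0)):

deg_df %>%
  filter(deg > 0, cum_freq > 0) %>%
  ggplot(aes(deg, cum_freq)) +
  geom_point() +
  scale_x_log10("Degree") +
  scale_y_log10("Cumulative Frequency")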