R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document is generated that includes both the text and the output of any embedded R code chunks. You can embed an R code chunk like the one shown below.
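In the source .Rmd file, a chunk is fenced with three backticks and an {r} header; everything between the fences runs when you knit. A minimal example (the chunk label summary-example is just a placeholder name):

```{r summary-example}
summary(cars)
```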

R Markdown & life advice to my past self analyzing datasets

Trust me when I say it will make your life much easier if you keep a running document with your analyses and explanations of what you did and why you did it, especially if you are juggling multiple projects and need to put one down for a month or so.

In the past, my “organisational” strategy was a giant .R file with all my R code for a project, and I would occasionally update a separate Word document with some of the better plots and try to write them up.

This is a bad strategy; don’t be past me.

Having a single document where you mix code, figures, and explanations will make things much easier.

library(ghostr)
library(nycflights13)
library(ggplot2)
library(readr)
library(dplyr)
data('ghost_sightings')
data('flights')

There are five core dplyr verbs: filter(), select(), mutate(), arrange(), and summarize().

The dplyr examples below come from http://sharpsightlabs.com/blog/2014/12/11/dplyr-intro-data-manipulation-with-r/

filter()

df.diamonds_ideal <- filter(diamonds, cut=="Ideal")
head(df.diamonds_ideal)
## # A tibble: 6 x 10
##   carat   cut color clarity depth table price     x     y     z
##   <dbl> <ord> <ord>   <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal     E     SI2  61.5    55   326  3.95  3.98  2.43
## 2  0.23 Ideal     J     VS1  62.8    56   340  3.93  3.90  2.46
## 3  0.31 Ideal     J     SI2  62.2    54   344  4.35  4.37  2.71
## 4  0.30 Ideal     I     SI2  62.0    54   348  4.31  4.34  2.68
## 5  0.33 Ideal     I     SI2  61.8    55   403  4.49  4.51  2.78
## 6  0.33 Ideal     I     SI2  61.2    56   403  4.49  4.50  2.75

select()

df.diamonds_ideal <- select(df.diamonds_ideal, carat, cut, color, price, clarity)

head(df.diamonds_ideal)
## # A tibble: 6 x 5
##   carat   cut color price clarity
##   <dbl> <ord> <ord> <int>   <ord>
## 1  0.23 Ideal     E   326     SI2
## 2  0.23 Ideal     J   340     VS1
## 3  0.31 Ideal     J   344     SI2
## 4  0.30 Ideal     I   348     SI2
## 5  0.33 Ideal     I   403     SI2
## 6  0.33 Ideal     I   403     SI2

mutate()

df.diamonds_ideal <- mutate(df.diamonds_ideal, price_per_carat = price/carat)

head(df.diamonds_ideal)
## # A tibble: 6 x 6
##   carat   cut color price clarity price_per_carat
##   <dbl> <ord> <ord> <int>   <ord>           <dbl>
## 1  0.23 Ideal     E   326     SI2        1417.391
## 2  0.23 Ideal     J   340     VS1        1478.261
## 3  0.31 Ideal     J   344     SI2        1109.677
## 4  0.30 Ideal     I   348     SI2        1160.000
## 5  0.33 Ideal     I   403     SI2        1221.212
## 6  0.33 Ideal     I   403     SI2        1221.212

arrange()

df.diamonds_ideal <- arrange(df.diamonds_ideal, carat)

head(df.diamonds_ideal)
## # A tibble: 6 x 6
##   carat   cut color price clarity price_per_carat
##   <dbl> <ord> <ord> <int>   <ord>           <dbl>
## 1  0.20 Ideal     E   367     VS2        1835.000
## 2  0.20 Ideal     D   367     VS2        1835.000
## 3  0.20 Ideal     E   367     VS2        1835.000
## 4  0.23 Ideal     E   326     SI2        1417.391
## 5  0.23 Ideal     J   340     VS1        1478.261
## 6  0.23 Ideal     G   404     VS1        1756.522

summarize()

summarize(df.diamonds_ideal, avg_price = mean(price, na.rm = TRUE) )
## # A tibble: 1 x 1
##   avg_price
##       <dbl>
## 1  3457.542
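summarize() gets much more useful once you combine it with group_by(), which is not one of the five verbs above but works with all of them. A quick sketch computing the average price within each cut:

diamonds %>%
  group_by(cut) %>%
  summarize(avg_price = mean(price, na.rm = TRUE))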

Chaining dplyr verbs with %>% (pipe)

df.diamonds_ideal_chained <- diamonds %>%
                              filter(cut=="Ideal") %>%
                              select(carat, cut, color, price, clarity) %>%
                              mutate(price_per_carat = price/carat)
head(df.diamonds_ideal_chained)
## # A tibble: 6 x 6
##   carat   cut color price clarity price_per_carat
##   <dbl> <ord> <ord> <int>   <ord>           <dbl>
## 1  0.23 Ideal     E   326     SI2        1417.391
## 2  0.23 Ideal     J   340     VS1        1478.261
## 3  0.31 Ideal     J   344     SI2        1109.677
## 4  0.30 Ideal     I   348     SI2        1160.000
## 5  0.33 Ideal     I   403     SI2        1221.212
## 6  0.33 Ideal     I   403     SI2        1221.212
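For comparison, here is the same pipeline written without the pipe. The nested calls have to be read inside-out, which is exactly what %>% saves you from:

df.diamonds_ideal_nested <- mutate(
  select(
    filter(diamonds, cut == "Ideal"),
    carat, cut, color, price, clarity),
  price_per_carat = price / carat)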

rapid plotting

diamonds %>%                                        # Start with the 'diamonds' dataset
  filter(cut == "Ideal") %>%                        # Then, filter down to rows where cut == Ideal
  ggplot(aes(x=color,y=price)) +                    # Then, plot using ggplot
    geom_boxplot()                                  #  and draw a boxplot

Twitter mining and social network analysis

Here I will introduce some code to look at people talking around hashtags (mostly scientific conferences). You can get the code at https://github.com/thomas-keller/tweet-conf . A more extended explanation of these analyses is at my website http://thomas-keller.github.io/articles/I-analyzed-evolution-2016-twitter-and-you-can-too-for-other-conferences/ .

In this example I’m using a csv of already-parsed tweets rather than downloading them live, to remove one thing that can go wrong.

library(twitteR)
library(ROAuth)
library(tidytext)
library(ggplot2)
library(wordcloud)
library(dplyr)

#formatTwDate from SmappR https://github.com/SMAPPNYU/smappR/
#By Pablo Barbera http://pablobarbera.com/ and others

formatTwDate <- function(datestring, format="datetime"){
  if (format=="datetime"){
    date <- as.POSIXct(datestring, format="%a %b %d %H:%M:%S %z %Y")
  }
  if (format=="date"){
    date <- as.Date(datestring, format="%a %b %d %H:%M:%S %z %Y")
  }   
  return(date)
}
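The format string matches Twitter’s created_at timestamps. A quick usage sketch (the timestamp below is made up for illustration):

formatTwDate("Wed Jul 13 12:34:56 +0000 2016")                  # POSIXct datetime
formatTwDate("Wed Jul 13 12:34:56 +0000 2016", format="date")   # Date only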

#search along the hashtag (you can have multiple hashtags if you want/need)
#convert to dataframe
hashtag<-'#SciPy2016'
confname<-substr(hashtag,2,nchar(hashtag))
#tw_list <- searchTwitter(hashtag, n = 1e4, since = '2016-08-3')# , until='2016-07-14') #~5k tweets
tw_df<-read_csv('scipy2016.csv') 
#tw_df<-twListToDF(tw_list)
tw_df<-unique(tw_df)
filename<-paste0(confname,".csv")
#write.csv(tw_df,file=filename,row.names=F)

Here is a simple wordcloud that demonstrates tidytext (unnest_tokens breaks each tweet into one word per row) and dplyr chaining. The pipe %>% feeds the object on its left into the function on its right, and each anti_join() drops the words that appear in a stop-word table.

users<-data.frame(word=tolower(tw_df$screenName),lexicon=rep('whatevs',nrow(tw_df)))
#breaks down tweets into words for tidy (word) level analyses
tidy_tw<-tw_df %>% unnest_tokens(word,text)

#removes uninformative words / ones that oversaturate the wordcloud
tw_stop<-data.frame(word=c(confname,tolower(confname),'htt','25','http','amp','gt','t.c','rt','https','t.co','___','1','2','3','4','5','6','7','8','9',"i\'m",'15','30','45','00','10'),lexicon='whatevs')
data("stop_words")
tidy_cloud <- tidy_tw %>%
 anti_join(tw_stop) %>%
  anti_join(stop_words) %>%
  anti_join(users)

print(tidy_cloud %>% count(word, sort = TRUE)) 
## # A tibble: 2,532 x 2
##          word     n
##         <chr> <int>
## 1      python   543
## 2        talk   518
## 3  jupyterlab   292
## 4        tool   220
## 5       scipy   196
## 6     jupyter   180
## 7        data   171
## 8       alpha   168
## 9        blog   167
## 10       post   167
## # ... with 2,522 more rows
tidy_cloud %>%
 count(word) %>%
 with(wordcloud(word, n,max.words = 100,colors=brewer.pal(8,'Dark2')))
## Warning in wordcloud(word, n, max.words = 100, colors = brewer.pal(8,
## "Dark2")): python could not be fit on page. It will not be plotted.

Now we want to get a sense of who the most active users of this hashtag are. Here I’m introducing some ways to modify the default ggplot output.

user.tweets <- as.data.frame(table(tw_df$screenName))
names(user.tweets) <- c("User", "Tweets")

# Order the table by number of tweets per user & do some culling
user.tweets <- user.tweets[with(user.tweets, order(-Tweets)), ]
user.tweets_fig<-user.tweets[user.tweets$Tweets>2,]
user.tweets_fig<-user.tweets_fig[1:40,]

#make the plot for the top 40 or so
#I normally hate the x and y guide lines, but they serve a purpose with the extreme skew and names
ggplot(data=user.tweets_fig, aes(x=reorder(User, Tweets), y=Tweets)) +
  geom_bar(stat='identity') +
  coord_flip() +
  scale_y_continuous("Tweets") +
  scale_x_discrete("User") +
  labs(title = paste(hashtag, " tweets per user")) +
  theme_bw() +
  theme(axis.title = element_text(face="bold"), axis.text.y = element_text(size=6))

Twitter sentiment

This is a simple “bag of words” sentiment analysis: each word of a tweet is matched against a lexicon of positive and negative words, and a tweet’s score is its count of positive words minus its count of negative words.

tidy_tw$created<-formatTwDate(tidy_tw$created)
tw_df$created<-formatTwDate(tw_df$created)


library(tidyr)
bing <- sentiments %>%
 filter(lexicon == "bing") %>%
 select(-score)

conf_sent <- tidy_tw %>%
 inner_join(bing) %>%
 count(id, sentiment) %>% 
 spread(sentiment, n, fill = 0) %>%
 mutate(sentiment = positive - negative) %>%
 inner_join(tw_df[,c(5,8)]) #join on id and created

library(cowplot)
library(scales)
library(lubridate)

#adjust time zone of tweets with lubridate
conf_sent$created<-ymd_hms(conf_sent$created,tz='EST')

#The plot could include labels, but I didn't have time to figure out what drives the
#inflection points in mood during these other conferences
df_labels<-data.frame(times=strptime(c("2016-07-13 12:00:00","2016-07-15 0:00:00","2016-07-16 16:30:00","2016-07-18 6:30:00"),"%Y-%m-%d %H:%M:%S"),
                      labels=c("it begins!\nmixers for all","science cafe\nfunny-man",'final day\nmixer stuff','that was pretty\ngood reflection'),
                      y=c(1.5,1.0,1.0,1.0))
ggplot(conf_sent, aes(created, sentiment)) +
 geom_smooth() + xlab("tweet time") + ylab("tweet sentiment")+
 scale_x_datetime(breaks = date_breaks("day")) + background_grid(major = "xy", minor = "none") +
 theme(axis.text.x=element_text(angle=315,vjust=.6))+
  #geom_text(data=df_labels,aes(x=times,y=y,label=labels),size=4)+
  ggtitle(paste(hashtag,"positive or negative emotions (think first order ~vibe of conf.)"))

 #coord_cartesian(ylim=c(-.5,1.2)) #+geom_text(data=df_labels,aes(x=times,y=y,label=labels),size=4)

tweet retweet network

library(twitteR)
library(tidytext)
library(ggplot2)
library(dplyr)
library(igraph)
library(stringr)

#count tweets per user, then build a dataframe sorted by number of tweets (descending)
hm<-sort(table(tw_df$screenName))
outdf<-data.frame(screen_name=names(hm),num_tweets=hm)[,c(1,3)]
outdf<-outdf[order(outdf[,2],decreasing=T),]

#OK, start of new code to develop RT network
#code (regex especially!!!) used liberally from
# https://sites.google.com/site/miningtwitter/questions/user-tweets/who-retweet

#TODO:
#replace retweet network construction (not plotting)
#with https://github.com/nfahlgren/conference_twitter_stats/blob/master/retweet_network_generic.R
#it's cleaner and doesn't rely on regex horrors I don't understand

rt_net<-grep("(RT|via)((?:\\b\\W*@\\w+)+)", tw_df$text, 
             ignore.case=TRUE,value=TRUE)
rt_neti<-grep("(RT|via)((?:\\b\\W*@\\w+)+)", tw_df$text, 
              ignore.case=TRUE)
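
#quick sanity check of the regex on a made-up tweet (hypothetical text,
#just to show what the pattern captures)
str_extract_all("RT @somebody: great talk on dplyr!",
                "(RT|via)((?:\\b\\W*@\\w+)+)")  #returns "RT @somebody"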

#next, create list to store user names
who_retweet <- as.list(1:length(rt_net))
who_post <- as.list(1:length(rt_net))

# for loop
for (i in 1:length(rt_net))
{ 
  # get tweet with retweet entity
  twit <- tw_df[rt_neti[i],]
  # get retweet source 
  poster<-str_extract_all(twit$text,"(RT|via)((?:\\b\\W*@\\w+)+)")  
  #remove ':'
  poster <- gsub(":", "", unlist(poster)) 
  # name of retweeted user
  who_post[[i]] <- gsub("(RT @|via @)", "", poster, ignore.case=TRUE) 
  # name of retweeting user 
  who_retweet[[i]] <- rep(twit$screenName, length(poster)) 
}

# unlist
who_post <- unlist(who_post)
who_retweet <- unlist(who_retweet)

####
#Preprocess the dataframes into an edge list (who retweeted whom), the format
#igraph likes

#I guess I need an edge aesthetic for ggraph to paint with
retweeter_poster <- data.frame(from=who_retweet, to=who_post,retweets=1)

#filters out some bad parsing and users who aren't in the node graph
#node_df has the screen_name and number of tweets per user, which will serve as the vertex/node dataframe
#in igraph speak
node_df<-outdf
names(node_df)<-c("id","num_tweets")
#This step is REALLY IMPORTANT for plotting purposes: it determines how dense the network is
#tune the cutoff based on how big you want your plotted network to be
node_df2<-droplevels(node_df[1:50,]) #selecting only the top 50 posting from #evol2016 for plotting purposes
filt_rt_post<-retweeter_poster[retweeter_poster$from %in% node_df2$id & retweeter_poster$to %in% node_df2$id,]
filt_rt_post<-droplevels(filt_rt_post) #ditch all those fleshbags that had to talk to people instead of tweeting
head(filt_rt_post)
##              from           to retweets
## 43        scopatz       dotsdl        1
## 48      SciPyConf  JackieKazil        1
## 51      SciPyConf      ericmjl        1
## 56    chendaniely     jhamrick        1
## 60     geo_leeman dopplershift        1
## 61 jnuneziglesias    SciPyConf        1
#this creates a directed graph with vertex/node info on num_tweets, and edge info on retweets
rt_graph<-graph_from_data_frame(d=droplevels(filt_rt_post),vertices=droplevels(node_df2),directed=T)

#simplify the graph: remove any self-retweets (Twitter now allows those, sadly)
#and merge all the multiple RTs between a pair of users into one weighted edge
#to simplify visualization (edge.attr.comb='sum' adds up the retweet counts)
rt_graph<-simplify(rt_graph,remove.multiple=T,remove.loops=TRUE,edge.attr.comb='sum')
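
A quick sanity check on the simplified graph; vcount() and ecount() report how many users and retweet relationships survived the filtering:

vcount(rt_graph)  #number of users (nodes)
ecount(rt_graph)  #number of retweet edges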

ggraph network

library(ggraph)
library(ggplot2)
#jpeg('evol2016_top50_twitter_network.jpg',width=960,height=960,pointsize=12)
ggraph(rt_graph,'igraph',algorithm='kk')+
  geom_edge_fan(edge_alpha=0.075)+   #fixed transparency; the summed 'retweets' edge attribute could instead be mapped with aes(edge_alpha=retweets)
  geom_node_point(aes(size=num_tweets))+
  geom_node_text(aes(label=name),vjust=-1.5)+
  ggforce::theme_no_axes()+
  theme(legend.position=c(.08,.88))

network degree distribution

deg.dist <-degree_distribution(rt_graph, cumulative=T, mode="all")
deg_df<-data.frame(deg=0:max(degree(rt_graph)),cum_freq=1-deg.dist) 
qplot(deg,cum_freq,data=deg_df,xlab="Degree",ylab="Cumulative Frequency")
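
If you want to eyeball whether the degree distribution is heavy-tailed, the usual first check is to replot on log-log axes. A sketch (rows with zeroes are dropped to avoid log(0)):

deg_df %>%
  filter(deg > 0, cum_freq > 0) %>%
  ggplot(aes(deg, cum_freq)) +
  geom_point() +
  scale_x_log10("Degree") +
  scale_y_log10("Cumulative Frequency")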