This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document is generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like the ones shown below.
Trust me when I say it will make your life much easier if you keep a running document with your analyses and explanations of what you did and why, especially if you are juggling multiple projects and need to put one down for a month or so.
In the past, my “organisational” strategy was a giant .R file with all the R code for a project, plus a separate Word document that I would occasionally update with some of the better plots and try to write up.
This is a bad strategy; don’t be past me.
Having a single document where you mix code, figures, and explanations will make things much easier.
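For reference, in the .Rmd source a code chunk is just a fenced block with an {r} header; a minimal example (the chunk label and contents here are placeholders) looks like:
```{r example-chunk}
summary(cars)
```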
library(ghostr)
library(nycflights13)
library(ggplot2)
library(readr)
library(dplyr)
data('ghost_sightings')
data('flights')
The dplyr examples below come from http://sharpsightlabs.com/blog/2014/12/11/dplyr-intro-data-manipulation-with-r/.
df.diamonds_ideal <- filter(diamonds, cut=="Ideal")
head(df.diamonds_ideal)
## # A tibble: 6 x 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.23 Ideal J VS1 62.8 56 340 3.93 3.90 2.46
## 3 0.31 Ideal J SI2 62.2 54 344 4.35 4.37 2.71
## 4 0.30 Ideal I SI2 62.0 54 348 4.31 4.34 2.68
## 5 0.33 Ideal I SI2 61.8 55 403 4.49 4.51 2.78
## 6 0.33 Ideal I SI2 61.2 56 403 4.49 4.50 2.75
df.diamonds_ideal <- select(df.diamonds_ideal, carat, cut, color, price, clarity)
head(df.diamonds_ideal)
## # A tibble: 6 x 5
## carat cut color price clarity
## <dbl> <ord> <ord> <int> <ord>
## 1 0.23 Ideal E 326 SI2
## 2 0.23 Ideal J 340 VS1
## 3 0.31 Ideal J 344 SI2
## 4 0.30 Ideal I 348 SI2
## 5 0.33 Ideal I 403 SI2
## 6 0.33 Ideal I 403 SI2
df.diamonds_ideal <- mutate(df.diamonds_ideal, price_per_carat = price/carat)
head(df.diamonds_ideal)
## # A tibble: 6 x 6
## carat cut color price clarity price_per_carat
## <dbl> <ord> <ord> <int> <ord> <dbl>
## 1 0.23 Ideal E 326 SI2 1417.391
## 2 0.23 Ideal J 340 VS1 1478.261
## 3 0.31 Ideal J 344 SI2 1109.677
## 4 0.30 Ideal I 348 SI2 1160.000
## 5 0.33 Ideal I 403 SI2 1221.212
## 6 0.33 Ideal I 403 SI2 1221.212
df.diamonds_ideal <- arrange(df.diamonds_ideal, carat)
head(df.diamonds_ideal)
## # A tibble: 6 x 6
## carat cut color price clarity price_per_carat
## <dbl> <ord> <ord> <int> <ord> <dbl>
## 1 0.20 Ideal E 367 VS2 1835.000
## 2 0.20 Ideal D 367 VS2 1835.000
## 3 0.20 Ideal E 367 VS2 1835.000
## 4 0.23 Ideal E 326 SI2 1417.391
## 5 0.23 Ideal J 340 VS1 1478.261
## 6 0.23 Ideal G 404 VS1 1756.522
summarize(df.diamonds_ideal, avg_price = mean(price, na.rm = TRUE) )
## # A tibble: 1 x 1
## avg_price
## <dbl>
## 1 3457.542
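summarize() becomes much more useful when paired with group_by(); a quick sketch of that idea, here averaging price within each clarity grade of the Ideal-cut diamonds, would look like:
#sketch: group the Ideal diamonds by clarity, then average price within each group
df.diamonds_ideal %>%
  group_by(clarity) %>%
  summarize(avg_price = mean(price, na.rm = TRUE))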
df.diamonds_ideal_chained <- diamonds %>%
filter(cut=="Ideal") %>%
select(carat, cut, color, price, clarity) %>%
mutate(price_per_carat = price/carat)
head(df.diamonds_ideal_chained)
## # A tibble: 6 x 6
## carat cut color price clarity price_per_carat
## <dbl> <ord> <ord> <int> <ord> <dbl>
## 1 0.23 Ideal E 326 SI2 1417.391
## 2 0.23 Ideal J 340 VS1 1478.261
## 3 0.31 Ideal J 344 SI2 1109.677
## 4 0.30 Ideal I 348 SI2 1160.000
## 5 0.33 Ideal I 403 SI2 1221.212
## 6 0.33 Ideal I 403 SI2 1221.212
diamonds %>% # Start with the 'diamonds' dataset
filter(cut == "Ideal") %>% # Then, filter down to rows where cut == Ideal
ggplot(aes(x=color,y=price)) + # Then, plot using ggplot
geom_boxplot() # and draw a boxplot
This is a simple “bag of words” sentiment analysis of the conference tweets.
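The tokenized tweet table tidy_tw isn't built in this chunk; a rough sketch of how it could be constructed with tidytext (assuming tw_df holds the raw tweets with id, created, and text columns, which is an assumption, not shown here) would be:
library(dplyr)
library(tidytext)
#rough sketch (assumed column names): split each tweet into one row per word, i.e. the "bag of words"
tidy_tw <- tw_df %>%
  select(id, created, text) %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word") #drop common English stop words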
#formatTwDate() is a helper defined elsewhere in the original analysis that cleans up the tweet creation timestamps
tidy_tw$created<-formatTwDate(tidy_tw$created)
tw_df$created<-formatTwDate(tw_df$created)
library(tidyr)
library(tidytext) #provides the sentiments lexicon data
bing <- sentiments %>%
filter(lexicon == "bing") %>%
select(-score) #keep just word and sentiment; newer tidytext releases offer this directly via get_sentiments("bing")
conf_sent <- tidy_tw %>%
inner_join(bing) %>%
count(id, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative) %>%
inner_join(tw_df[,c(5,8)]) #join on id and created
library(cowplot)
library(scales)
library(lubridate)
#adjust time zone of tweets with lubridate
conf_sent$created<-ymd_hms(conf_sent$created,tz='EST')
#The example could include labels, but I don't have time to figure out what is driving
#the inflection points of mood during these other conferences
df_labels<-data.frame(times=strptime(c("2016-07-13 12:00:00","2016-07-15 0:00:00","2016-07-16 16:30:00","2016-07-18 6:30:00"),"%Y-%m-%d %H:%M:%S"),
labels=c("it begins!\nmixers for all","science cafe\nfunny-man",'final day\nmixer stuff','that was pretty\ngood reflection'),
y=c(1.5,1.0,1.0,1.0))
ggplot(conf_sent, aes(created, sentiment)) +
geom_smooth() + xlab("tweet time") + ylab("tweet sentiment")+
scale_x_datetime(breaks = date_breaks("day")) + background_grid(major = "xy", minor = "none") +
theme(axis.text.x=element_text(angle=315,vjust=.6))+
#geom_text(data=df_labels,aes(x=times,y=y,label=labels),size=4)+
ggtitle(paste(hashtag,"positive or negative emotions (think first order ~vibe of conf.)"))
#coord_cartesian(ylim=c(-.5,1.2)) #+geom_text(data=df_labels,aes(x=times,y=y,label=labels),size=4)
library(twitteR)
library(tidytext)
library(ggplot2)
library(dplyr)
library(igraph)
library(stringr)
#count tweets per user and sort to find the most active accounts
hm<-sort(table(tw_df$screenName))
outdf<-data.frame(screen_name=names(hm),num_tweets=hm)[,c(1,3)]
outdf<-outdf[order(outdf[,2],decreasing=T),]
#OK, start of new code to develop RT network
#code (regex especially!!!) used liberally from
# https://sites.google.com/site/miningtwitter/questions/user-tweets/who-retweet
#TODO:
#replace retweet network construction (not plotting)
#with https://github.com/nfahlgren/conference_twitter_stats/blob/master/retweet_network_generic.R
#it's cleaner and doesn't rely on regex horrors I don't understand
rt_net<-grep("(RT|via)((?:\\b\\W*@\\w+)+)", tw_df$text,
ignore.case=TRUE,value=TRUE)
rt_neti<-grep("(RT|via)((?:\\b\\W*@\\w+)+)", tw_df$text,
ignore.case=TRUE)
#next, create lists to store the user names
who_retweet <- as.list(1:length(rt_net))
who_post <- as.list(1:length(rt_net))
#loop over each retweet and record who retweeted whom
for (i in 1:length(rt_net))
{
  # get the tweet containing a retweet entity
  twit <- tw_df[rt_neti[i],]
  # extract the retweet source
  poster <- str_extract_all(twit$text,"(RT|via)((?:\\b\\W*@\\w+)+)")
  # remove ':'
  poster <- gsub(":", "", unlist(poster))
  # name of the retweeted user
  who_post[[i]] <- gsub("(RT @|via @)", "", poster, ignore.case=TRUE)
  # name of the retweeting user
  who_retweet[[i]] <- rep(twit$screenName, length(poster))
}
# unlist
who_post <- unlist(who_post)
who_retweet <- unlist(who_retweet)
####
#Preprocess the data frames into an edge list (from, to) in a form igraph likes
#I need an edge aesthetic for ggraph to paint with
retweeter_poster <- data.frame(from=who_retweet, to=who_post,retweets=1)
#filters out some bad parsing and users who aren't in the node graph
#node_df has the screen_name and number of tweets per user, which will serve as the vertex/node dataframe
#in igraph speak
node_df<-outdf
names(node_df)<-c("id","num_tweets")
#This step is REALLY IMPORTANT for plotting purposes: it determines how dense the network is
#tune it based on how big you want the plotted network to be
node_df2<-droplevels(node_df[1:50,]) #keep only the top 50 posters under the hashtag for plotting purposes
filt_rt_post<-retweeter_poster[retweeter_poster$from %in% node_df2$id & retweeter_poster$to %in% node_df2$id,]
filt_rt_post<-droplevels(filt_rt_post) #ditch all those fleshbags that had to talk to people instead of tweeting
head(filt_rt_post)
## from to retweets
## 43 scopatz dotsdl 1
## 48 SciPyConf JackieKazil 1
## 51 SciPyConf ericmjl 1
## 56 chendaniely jhamrick 1
## 60 geo_leeman dopplershift 1
## 61 jnuneziglesias SciPyConf 1
#this creates a directed graph with vertex/node info on num_tweets, and edge info on retweets
rt_graph<-graph_from_data_frame(d=droplevels(filt_rt_post),vertices=droplevels(node_df2),directed=T)
#simplify the graph: remove any self retweets (Twitter is dumb and now allows those)
#and merge all the multiple RTs between a pair of users into a single summed edge
#to simplify the visualization
rt_graph<-simplify(rt_graph,remove.multiple=T,remove.loops=TRUE,edge.attr.comb='sum')
library(ggraph)
library(ggplot2)
#jpeg('evol2016_top50_twitter_network.jpg',width=960,height=960,pointsize=12)
ggraph(rt_graph,'igraph',algorithm='kk')+
geom_edge_fan(aes(edge_alpha=retweets))+ #fade edges by how many times the pair retweeted each other
geom_node_point(aes(size=num_tweets))+
geom_node_text(aes(label=name,vjust=-1.5))+
ggforce::theme_no_axes()+
theme(legend.position=c(.08,.88))
deg.dist <-degree_distribution(rt_graph, cumulative=T, mode="all")
deg_df<-data.frame(deg=0:max(degree(rt_graph)),cum_freq=1-deg.dist)
qplot(deg,cum_freq,data=deg_df,xlab="Degree",ylab="Cumulative Frequency")