R Scripts for Twitter Sentiment Analysis
This project's Twitter sentiment analysis was completed using the R language and accompanying libraries. Below, you'll find the two scripts responsible for Tweet collection, sentiment analysis, and sentiment visualizations.
The following R script is primarily responsible for connecting to the Twitter API, collecting tweets, refining them, calculating their sentiments, and creating sentiment visualizations in the form of bar plots.
# Load required R packages
library(NLP)
library(twitteR)
library(syuzhet)
library(tm)
library(ROAuth)
library(ggplot2)
# Set the working directory (replace the "XXXX" placeholder with a real path).
# String delimiters must be plain ASCII double quotes; typographic ("curly")
# quotes are not valid R syntax and make the script fail to parse.
setwd(dir = "XXXX")
# Input credentials and establish Twitter connection
# (replace each "XXXX" placeholder with the keys of your own Twitter app)
consumer_key <- "XXXX"
consumer_secret <- "XXXX"
access_token <- "XXXX"
access_secret <- "XXXX"
setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)
# Search Twitter with search term, number of Tweets, language, geocode, and date range
tweets_covid <- searchTwitter(
  "coronavirus",
  n = 150,
  lang = "en",
  geocode = "[INSERT LAT],[INSERT LONG],[INSERT RADIUS]",
  since = "DATE",
  until = "DATE"
)
# Fail early with a clear message instead of erroring later on missing columns
if (length(tweets_covid) == 0) {
  stop("The Twitter search returned no Tweets; adjust the search parameters.",
       call. = FALSE)
}
# Flatten the list of returned Tweets into a data frame; the rest of the
# script reads the Tweet text from covid_df$text
covid_df <- twListToDF(tweets_covid)
covid_text <- covid_df$text
# Build a text corpus from the collected Tweets and run the basic formatting
# steps over it, in this order:
#   1. make lowercase
#   2. remove numbers
#   3. remove punctuation
#   4. collapse whitespace
covidCorpus <- Reduce(
  function(corpus, step_fun) tm_map(corpus, step_fun),
  list(
    content_transformer(tolower),
    removeNumbers,
    removePunctuation,
    stripWhitespace
  ),
  Corpus(VectorSource(covid_df$text))
)
# Load the large stopword list (developed from NLTK's list of English
# stopwords - Github) and strip those words from the Tweets
tweetStopwords <- readLines("stopwords-big")
covidCorpus <- tm_map(covidCorpus, removeWords, tweetStopwords)
# Partial processing function from CateGitau (Github)
#
# Strips URLs, hashtags, mentions, control characters and digits from text.
#
# x: character vector of Tweet texts.
# Returns a character vector of the same length as x with the patterns below
# removed.
#
# Each substitution is applied to the result of the previous one. Calling
# gsub() repeatedly on the untouched input and discarding the results would
# leave only the final substitution in effect.
Textprocessing <- function(x) {
  # Full URLs first, so that "http" is not cut off a URL leaving "://..." behind
  x <- gsub("http\\S+\\s*", "", x) ## Remove URLs
  # Any bare "http" token that is still left over
  x <- gsub("http[[:alnum:]]*", "", x)
  x <- gsub("#\\S+", "", x) ## Remove Hashtags
  x <- gsub("@\\S+", "", x) ## Remove Mentions
  x <- gsub("[[:cntrl:]]", "", x) ## Remove Controls and special characters
  gsub("\\d", "", x) ## Remove digits
}
# Apply the custom cleaner through content_transformer(), as is done for
# tolower, so every document stays a valid tm text document.
# NOTE(review): punctuation has already been removed from the corpus at this
# point, so the "#..." and "@..." patterns in Textprocessing can no longer
# match; move this step before removePunctuation if hashtags and mentions
# should be dropped entirely.
covidCorpus <- tm_map(covidCorpus, content_transformer(Textprocessing))
# Associate sentiments and calculate sentiment score
# NOTE(review): scoring runs on covid_text (the raw Tweet text), not on the
# cleaned covidCorpus - confirm this is intended.
mysentiment_covid <- get_nrc_sentiment(covid_text)
# Total every sentiment column over all Tweets; the column names of the NRC
# result become the sentiment labels
sentiment_totals <- colSums(mysentiment_covid)
# Two-column table: sentiment label and its total score, with default row names
Sentimentscores_covid <- data.frame(
  sentiment = names(sentiment_totals),
  Score = unname(sentiment_totals),
  row.names = NULL
)
# View the head of available sentiments
head(Sentimentscores_covid)
# Plot barplot of only positive and negative sentiments and add count on top of each bar
# Rows 9:10 of the score table are used as the negative/positive rows
# (presumably the last two columns returned by get_nrc_sentiment - verify
# with head()/print if the lexicon output changes).
# Each "+" must END a line: a line that starts with "+" is parsed as a new
# statement, so the layers would never be added to the plot.
ggplot(data = Sentimentscores_covid[9:10, ], aes(x = sentiment, y = Score)) +
  geom_bar(aes(fill = sentiment), stat = "identity") +
  theme(legend.position = "none") +
  xlab("Sentiment") +
  ylab("Score") +
  ggtitle("Sentiments of Tweets on Coronavirus: [Insert County Name & Date Range Here]") +
  geom_text(stat = "identity", aes(label = Score), vjust = -1)
The next R script makes use of the positive and negative sentiment counts calculated by the above script. With these counts, it creates choropleth maps (commonly known as heat maps) to visually differentiate positive and negative sentiment densities across New Jersey counties.
# Load map data and plotting packages
library(ggplot2)
library(ggmap)
library(maps)
library(mapdata)
# Pull the map outlines shipped with the maps package: the whole country,
# every state, and every county
usa <- map_data("usa")
states <- map_data("state")
counties <- map_data("county")
# Keep only the New Jersey rows of the state and county outlines
nj_df <- states[which(states$region == "new jersey"), ]
nj_county <- counties[which(counties$region == "new jersey"), ]
# Base state map: gray fill with a black outline, drawn at a fixed 1.3 aspect ratio
nj_base <- ggplot(nj_df, aes(x = long, y = lat, group = group)) +
  geom_polygon(fill = "gray", color = "black") +
  coord_fixed(ratio = 1.3)
# Show the bare state shape without axes or background
nj_base + theme_nothing()
# Show it again with white county borders, re-drawing the black state outline on top
nj_base +
  theme_nothing() +
  geom_polygon(data = nj_county, color = "white", fill = NA) +
  geom_polygon(color = "black", fill = NA)
# Read spreadsheet with count values (Pos/Neg Tweet Sentiments)
# The CSV needs a "subregion" column (the merge key) and the
# positive_sentiments / negative_sentiments columns used by the maps below.
setwd("[INSERT FILEPATH HERE]")
nj_sent <- read.csv("NJCountyMap.csv")
# merge() keeps only counties present in both tables, so a misspelled or
# differently-cased county name would silently vanish from the map
unmatched_counties <- setdiff(unique(nj_county$subregion), nj_sent$subregion)
if (length(unmatched_counties) > 0) {
  warning(
    "Counties without sentiment counts: ",
    paste(unmatched_counties, collapse = ", "),
    call. = FALSE
  )
}
# Merge datasets by county column (renamed to subregion)
njtwt <- merge(nj_county, nj_sent, by = "subregion")
# merge() does not keep the original row order, but geom_polygon() connects
# vertices in row order; restore the drawing sequence stored in the map
# data's "order" column so county outlines are not scrambled
njtwt <- njtwt[order(njtwt$order), ]
# Plot Positive Tweet Sentiments Choropleth Map
# Counties are filled by their positive-sentiment count (light to dark green)
# and the black state outline is drawn on top.
# Each "+" must END a line: a line that starts with "+" is parsed as a new
# statement, so the colour scale, theme and title would never reach the map.
nj_pos_map <- nj_base +
  geom_polygon(data = njtwt, aes(fill = positive_sentiments), color = "white") +
  geom_polygon(color = "black", fill = NA) +
  scale_fill_gradient(low = "green", high = "darkgreen") +
  theme_bw() +
  ggtitle("Positive Tweet Sentiments in New Jersey, by County")
# Display the map with a readable legend title
nj_pos_map + labs(fill = "# of Tweets")
# Plot Negative Tweet Sentiments Choropleth Map
# Counties are filled by their negative-sentiment count (light to dark red)
# and the black state outline is drawn on top.
# Each "+" must END a line: a line that starts with "+" is parsed as a new
# statement, so the outline, colour scale, theme and title would be lost.
nj_neg_map <- nj_base +
  geom_polygon(data = njtwt, aes(fill = negative_sentiments), color = "white") +
  geom_polygon(color = "black", fill = NA) +
  scale_fill_gradient(low = "red", high = "darkred") +
  theme_bw() +
  ggtitle("Negative Tweet Sentiments in New Jersey, by County")
# Display the map with a readable legend title
nj_neg_map + labs(fill = "# of Tweets")