This document describes the creation of topics with the quanteda
and topicmodels
packages. The texts from which topics are extracted need to be in one column in a data frame. The following code gives a brief overview of the data.
AllTweets = readRDS("AllTweets.Rds")
dim(AllTweets)
## [1] 98072 17
The texts are tweets that are collected from a selected number of parlement members. They are listed in the following table.
AllTweets$uur = hour(AllTweets$created)
AllTweets %>%
group_by(screenName,Partij) %>% summarise(ntweets=n()) %>%
DT::datatable(
options = list(
pageLength = 10, autoWidth = TRUE
),
caption ="selected politicians"
)
Some interesting statistics are given by the following graphs. It turns out that politicians are just like ordinary humans, they are less productive in the summer and in weekends.
daily = AllTweets %>%
filter(created > "2016-03-15") %>%
mutate(
dag = cut(created, breaks="days")
) %>%
group_by(dag ) %>%
summarise(n=n())
plot_ly(
daily, type = "bar", x = ~dag, y = ~n
) %>%
layout(
xaxis = list( tickangle = 45),
title = "Number of tweets by the politicians per day",
yaxis = list(title= "n")
)
ggplot(AllTweets, aes(uur, fill=Partij)) +
geom_bar(aes(y = (..count..)/sum(..count..)),position="dodge") +
scale_y_continuous(labels = scales::percent) +
ylab("percentage tweets") +
ggtitle("tweets per hour over the day per party")
AllTweets %>%
mutate(
dayofweek = lubridate::wday(created, label=TRUE)
) %>%
group_by(dayofweek) %>%
summarise(n=n()) %>%
ggplot(aes(dayofweek)) +
geom_bar(aes(weight = n)) +
ggtitle("tweets per day of week")
Clean texts in tweets, remove punctuations, https etc….
AllTweets = AllTweets %>%
mutate(
clean_tweet = str_replace(text,"RT @[a-z,A-Z]*: ",""),
clean_tweet = str_replace_all(clean_tweet, "https://t.co/[a-z,A-Z,0-9]*", ""),
clean_tweet = str_replace_all(clean_tweet,"@[a-z,A-Z]*","")
)
See the word length of the tweets
## woord lengte
WordLength = str_match_all(
AllTweets$clean_tweet,
'\\s+'
)
AllTweets$AantalWoorden = sapply(WordLength,length)
ggplot(
data = AllTweets,
aes(AantalWoorden)
) +
geom_histogram(
binwidth = 1,
col="black"
) +
facet_grid(~Partij)
AllTweets %>%
group_by(Partij) %>%
summarise(
MedianWords = median(AantalWoorden),
AverageWords = mean(AantalWoorden)
)
## # A tibble: 10 × 3
## Partij MedianWords AverageWords
## <chr> <dbl> <dbl>
## 1 CDA 16 15.40702
## 2 ChristenUnie 13 12.14879
## 3 D66 14 12.46918
## 4 GROENLINKS 13 11.86304
## 5 PARTIJvoorDIEREN 15 13.67361
## 6 PVDA 12 11.54191
## 7 PVV 13 11.78429
## 8 SGP 14 12.75528
## 9 SP 13 12.69486
## 10 VVD 14 13.34816
For the analysis I have used only tweets with 5 or more words
## gebruik alleen commentaren met meer dan 5 woorden
AllTweets_5 = filter(AllTweets, AantalWoorden >= 5)
TweetCorp = corpus(AllTweets_5$clean_tweet)
summary(TweetCorp, n=10)
## Corpus consisting of 85688 documents, showing 10 documents.
##
## Text Types Tokens Sentences
## text1 14 17 2
## text2 18 19 2
## text3 7 7 1
## text4 5 6 1
## text5 18 21 3
## text6 22 23 3
## text7 19 20 2
## text8 15 18 2
## text9 20 22 2
## text10 9 10 2
##
## Source: /home/longhowlam/RProjects/kamerleden/* on x86_64 by longhowlam
## Created: Mon Jan 30 08:06:23 2017
## Notes:
From the all the tweets we now create a term document matrix. First ignore certain words
StopWoorden = c(
"amp", "nl", "via", "vd", "ga", "af", "onze",
"vvd", "pvda", "cda", "sp", "sgp", "we", "and", "the",
"to", "no", "with", "this", "http","t.co",
stopwords("english"),
stopwords("dutch"),
letters
)
TweetCorpdfm = dfm(
TweetCorp,
ignoredFeatures = StopWoorden,
stem = FALSE,
ngrams = 1:2
)
## Creating a dfm from a corpus ...
##
## ... lowercasing
##
## ... tokenizing
##
## ... indexing documents: 85,688 documents
##
## ... indexing features:
## 306,149 feature types
##
## ...
## removed 146,884 features, from 305 supplied (glob) feature types
## ... created a 85688 x 159266 sparse dfm
## ... complete.
## Elapsed time: 26.1 seconds.
The dimension of the term document matrix is:
dim(TweetCorpdfm)
## [1] 85688 159266
The top 50 words occuring in the data are given by
## nodig om later te kunnen koppelen
AllTweets_5$textID = row.names(TweetCorpdfm)
### top features / words
topfeatures( TweetCorpdfm, n = 50)
## wel goed vandaag nederland rutte kamer weer
## 2950 2924 2613 2466 2464 2315 2301
## debat gaat jaar mensen eu mooi vragen
## 2270 2256 2129 2116 2095 1866 1827
## pvv nieuwe alle gaan dank zorg motie
## 1694 1672 1667 1658 1656 1621 1591
## turkije moeten kabinet minister waar mee eigen
## 1569 1500 1426 1396 1354 1319 1311
## mooie nee heel samen dag nodig tijd
## 1302 1256 1201 1183 1172 1147 1137
## komt zegt islam tweede minder maken land
## 1109 1101 1100 1084 1083 1071 1069
## steun goede geld even zie alleen weg
## 1042 1037 1036 970 970 968 963
## waarom
## 949
plot(
TweetCorpdfm,
max.words = 200,
scale=c(1.5,.25),
colors = brewer.pal(8, "Dark2")
)
Now apply latent dirichlet allocation (LDA) per political party. For each politcal party an LDA is performed.
k = 4
NTERMS = 30
ResultsLDA = list()
partijen = names(table(AllTweets_5$Partij))
### reduce terms
for(partij in partijen)
{
tweets_eenPartij = filter(AllTweets_5, Partij == partij)
TweetCorp = corpus(tweets_eenPartij$clean_tweet)
TweetCorpdfm = dfm(
TweetCorp,
ignoredFeatures = StopWoorden,
stem = FALSE,
ngrams = 1:2
)
TweetCorpdfm_reduced <- trim(TweetCorpdfm, minCount = 5, minDoc = 5)
print(dim(TweetCorpdfm_reduced))
ResultsLDA[[partij]] = LDA(convert(TweetCorpdfm_reduced, to = "topicmodels"), k = k)
}
## [1] 8756 5069
## [1] 2473 1418
## [1] 7965 4201
## [1] 7133 2901
## [1] 1024 864
## [1] 10483 5211
## [1] 18034 12248
## [1] 3291 2172
## [1] 11013 5432
## [1] 15516 7538
Table of terms per topic per political party
ALLTOPICTERMS = NULL
for(partij in partijen)
{
TOPICTERMS = as.data.frame(get_terms( ResultsLDA[[partij]], NTERMS))
TOPICTERMS$Partij = partij
ALLTOPICTERMS = rbind(ALLTOPICTERMS, TOPICTERMS)
}
DT::datatable(
ALLTOPICTERMS,
options = list(
pageLength = 30, autoWidth = TRUE
),
caption = paste("Topics in tweets")
)
Word cloud visualisations per politcal party
##### visualisatie wordclouds per topic ####
for(partij in partijen)
{
TweetTopics = ResultsLDA[[partij]]
png(paste0(partij,".png"), width = 700, height = 700)
par(mfrow=c(2,2))
for (i in 1:k)
{
Topic = i
TopN = 100
TermProbsPerTopic = data.frame(
Term = TweetTopics@terms,
TermProb = TweetTopics@beta[Topic,],
stringsAsFactors = FALSE
)
TermProbsPerTopic = arrange(TermProbsPerTopic, desc(TermProb)) [1:TopN,]
pal = brewer.pal(9,"BuGn")
pal <- pal[-(1:4)]
wordcloud(
TermProbsPerTopic$Term,
scale=c(4,.15),
exp(TermProbsPerTopic$TermProb)*1000,
color = pal,
min.freq = 1
)
title(paste("Topic:" ,i))
}
dev.off()
}
The following figures are the results of the code above.