1 Introduction

This document describes the creation of topics with the quanteda and topicmodels packages. The texts from which topics are extracted need to be in one column in a data frame. The following code gives a brief overview of the data.

1.1 The data

AllTweets = readRDS("AllTweets.Rds")
dim(AllTweets)

## [1] 98072    17

The texts are tweets that are collected from a selected number of parlement members. They are listed in the following table.

AllTweets$uur = hour(AllTweets$created)
AllTweets %>% 
  group_by(screenName,Partij) %>% summarise(ntweets=n()) %>%
  DT::datatable(
    options = list(
      pageLength = 10, autoWidth = TRUE
      ),
    caption ="selected politicians"
  )

1.2 Some statistics

Some interesting statistics are given by the following graphs. It turns out that politicians are just like ordinary humans, they are less productive in the summer and in weekends.

daily = AllTweets %>% 
  filter(created > "2016-03-15") %>%
  mutate(
    dag = cut(created, breaks="days")
  ) %>%
  group_by(dag ) %>%
  summarise(n=n())

plot_ly(
  daily, type = "bar",  x = ~dag,  y = ~n
) %>%
  layout(
    xaxis = list( tickangle = 45),
    title = "Number of tweets by the politicians per day", 
    yaxis = list(title= "n")
  )

ggplot(AllTweets, aes(uur, fill=Partij)) + 
  geom_bar(aes(y = (..count..)/sum(..count..)),position="dodge") +
  scale_y_continuous(labels = scales::percent) +
  ylab("percentage tweets") +
  ggtitle("tweets per hour over the day per party")

AllTweets %>% 
  mutate(
    dayofweek = lubridate::wday(created, label=TRUE)
  ) %>%
  group_by(dayofweek) %>%
  summarise(n=n()) %>%
  ggplot(aes(dayofweek)) +
  geom_bar(aes(weight = n)) +
  ggtitle("tweets per day of week")

1.3 Clean tweets

Clean texts in tweets, remove punctuations, https etc….

AllTweets = AllTweets %>% 
  mutate(
    clean_tweet = str_replace(text,"RT @[a-z,A-Z]*: ",""),
    clean_tweet = str_replace_all(clean_tweet,  "https://t.co/[a-z,A-Z,0-9]*", ""),
    clean_tweet =  str_replace_all(clean_tweet,"@[a-z,A-Z]*","")   
  )

See the word length of the tweets

## woord lengte
WordLength = str_match_all(
  AllTweets$clean_tweet, 
  '\\s+'
)

AllTweets$AantalWoorden = sapply(WordLength,length)
ggplot(
  data = AllTweets, 
  aes(AantalWoorden)
) + 
  geom_histogram(
    binwidth = 1, 
    col="black"
  ) + 
  facet_grid(~Partij)

AllTweets %>% 
  group_by(Partij) %>% 
  summarise(
    MedianWords = median(AantalWoorden),
    AverageWords = mean(AantalWoorden)
  )

## # A tibble: 10 × 3
##              Partij MedianWords AverageWords
##               <chr>       <dbl>        <dbl>
## 1               CDA          16     15.40702
## 2      ChristenUnie          13     12.14879
## 3               D66          14     12.46918
## 4        GROENLINKS          13     11.86304
## 5  PARTIJvoorDIEREN          15     13.67361
## 6              PVDA          12     11.54191
## 7               PVV          13     11.78429
## 8               SGP          14     12.75528
## 9                SP          13     12.69486
## 10              VVD          14     13.34816

1.4 Creation of corpus

For the analysis I have used only tweets with 5 or more words

## gebruik alleen commentaren met meer dan 5 woorden
AllTweets_5 = filter(AllTweets, AantalWoorden >= 5)
TweetCorp = corpus(AllTweets_5$clean_tweet)

summary(TweetCorp, n=10)

## Corpus consisting of 85688 documents, showing 10 documents.
## 
##    Text Types Tokens Sentences
##   text1    14     17         2
##   text2    18     19         2
##   text3     7      7         1
##   text4     5      6         1
##   text5    18     21         3
##   text6    22     23         3
##   text7    19     20         2
##   text8    15     18         2
##   text9    20     22         2
##  text10     9     10         2
## 
## Source:  /home/longhowlam/RProjects/kamerleden/* on x86_64 by longhowlam
## Created: Mon Jan 30 08:06:23 2017
## Notes:

1.5 The term document matrix

From the all the tweets we now create a term document matrix. First ignore certain words

StopWoorden =  c(
  "amp", "nl", "via", "vd", "ga", "af", "onze", 
  "vvd", "pvda", "cda", "sp", "sgp", "we", "and", "the", 
  "to", "no", "with", "this", "http","t.co",
  stopwords("english"),
  stopwords("dutch"), 
  letters
)

TweetCorpdfm = dfm(
  TweetCorp,
  ignoredFeatures = StopWoorden,
  stem = FALSE,
  ngrams = 1:2
)

## Creating a dfm from a corpus ...

## 
##    ... lowercasing

## 
##    ... tokenizing

## 
##    ... indexing documents: 85,688 documents

## 
##    ... indexing features:

## 306,149 feature types

##

## ...

## removed 146,884 features, from 305 supplied (glob) feature types

##    ... created a 85688 x 159266 sparse dfm
##    ... complete. 
## Elapsed time: 26.1 seconds.

The dimension of the term document matrix is:

dim(TweetCorpdfm)

## [1]  85688 159266

The top 50 words occuring in the data are given by

## nodig om later te kunnen koppelen
AllTweets_5$textID = row.names(TweetCorpdfm)


### top features / words
topfeatures( TweetCorpdfm, n = 50)

##       wel      goed   vandaag nederland     rutte     kamer      weer 
##      2950      2924      2613      2466      2464      2315      2301 
##     debat      gaat      jaar    mensen        eu      mooi    vragen 
##      2270      2256      2129      2116      2095      1866      1827 
##       pvv    nieuwe      alle      gaan      dank      zorg     motie 
##      1694      1672      1667      1658      1656      1621      1591 
##   turkije    moeten   kabinet  minister      waar       mee     eigen 
##      1569      1500      1426      1396      1354      1319      1311 
##     mooie       nee      heel     samen       dag     nodig      tijd 
##      1302      1256      1201      1183      1172      1147      1137 
##      komt      zegt     islam    tweede    minder     maken      land 
##      1109      1101      1100      1084      1083      1071      1069 
##     steun     goede      geld      even       zie    alleen       weg 
##      1042      1037      1036       970       970       968       963 
##    waarom 
##       949

plot(
  TweetCorpdfm, 
  max.words = 200, 
  scale=c(1.5,.25),
  colors = brewer.pal(8, "Dark2")
)

2 Topics per political party

Now apply latent dirichlet allocation (LDA) per political party. For each politcal party an LDA is performed.

k = 4
NTERMS = 30
ResultsLDA = list()

partijen = names(table(AllTweets_5$Partij))

### reduce terms
for(partij in partijen)
{
  
  tweets_eenPartij = filter(AllTweets_5, Partij == partij)
  TweetCorp = corpus(tweets_eenPartij$clean_tweet)
  TweetCorpdfm = dfm(
    TweetCorp,
    ignoredFeatures = StopWoorden,
    stem = FALSE,
    ngrams = 1:2
  )
  TweetCorpdfm_reduced <- trim(TweetCorpdfm, minCount = 5, minDoc = 5)
  print(dim(TweetCorpdfm_reduced))
  ResultsLDA[[partij]] = LDA(convert(TweetCorpdfm_reduced, to = "topicmodels"), k = k)

  
}

## [1] 8756 5069
## [1] 2473 1418
## [1] 7965 4201
## [1] 7133 2901
## [1] 1024  864
## [1] 10483  5211
## [1] 18034 12248
## [1] 3291 2172
## [1] 11013  5432
## [1] 15516  7538

Table of terms per topic per political party

ALLTOPICTERMS = NULL
for(partij in partijen)
{
  TOPICTERMS = as.data.frame(get_terms( ResultsLDA[[partij]], NTERMS))
  TOPICTERMS$Partij = partij
  ALLTOPICTERMS = rbind(ALLTOPICTERMS, TOPICTERMS)
}

DT::datatable(
  ALLTOPICTERMS, 
  options = list(
    pageLength = 30, autoWidth = TRUE
  ),
  caption = paste("Topics in tweets")
)

2.1 Word clouds

Word cloud visualisations per politcal party

##### visualisatie wordclouds per topic ####
for(partij in partijen)
{
  TweetTopics = ResultsLDA[[partij]]
  png(paste0(partij,".png"),  width = 700, height = 700)
  par(mfrow=c(2,2))
  
  for (i in 1:k)
  {
    Topic = i
    TopN = 100
  
    TermProbsPerTopic = data.frame(
      Term = TweetTopics@terms, 
      TermProb = TweetTopics@beta[Topic,],
      stringsAsFactors = FALSE
    )
    TermProbsPerTopic = arrange(TermProbsPerTopic, desc(TermProb)) [1:TopN,]
  
    pal = brewer.pal(9,"BuGn")
    pal <- pal[-(1:4)]
  
   
    wordcloud(
      TermProbsPerTopic$Term,
      scale=c(4,.15), 
      exp(TermProbsPerTopic$TermProb)*1000,
      color = pal,
      min.freq = 1
    )
    title(paste("Topic:" ,i))
  
  }
  
  dev.off()
}

2.2 word cloud results

The following figures are the results of the code above.

Topics in tweets of some Dutch politicians

Topic modeling with quanteda and topic models

1 Introduction

1.1 The data

1.2 Some statistics

1.3 Clean tweets

1.4 Creation of corpus

1.5 The term document matrix

2 Topics per political party

2.1 Word clouds

2.2 word cloud results

2.2.1 VVD Topics

2.2.2 CDA topics

2.2.3 SP topics

2.2.4 SGP topics

2.2.5 Partij voor de dieren topics

2.2.6 D66 topics

2.2.7 PvdA topic

2.2.8 Groenlinks topics

2.2.9 PVV topics