Introduction

This is the capstone project for the Data Science Specialization. In this work we focus on Natural Language Processing and word prediction.

Loading data

Here we load the data from the original dataset, draw a random sample of 20% of the records from each source, and save the samples.

library(dplyr)     # sample_frac(), filter(), count(), joins
library(tidyr)     # extract()
library(tidytext)  # unnest_tokens(), stop_words, get_sentiments()
library(ggplot2)

set.seed(5832)

# Read a text file and return a random sample of sampleProp of its lines.
processFile <- function(filepath, sampleProp) {
  df <- data.frame(text = readLines(filepath))
  sample_frac(df, sampleProp)
}


dir.create(file.path(".", "sample"), showWarnings = FALSE)
if(!file.exists("sample/twitter_sample.txt")){
  twsample <- processFile("final/en_US/en_US.twitter.txt", 0.2)
  write.table(twsample, "sample/twitter_sample.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)
}
if(!file.exists("sample/blogs_sample.txt")){
  blogsample <- processFile("final/en_US/en_US.blogs.txt",  0.2)
  write.table(blogsample, "sample/blogs_sample.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)
}
if(!file.exists("sample/news_sample.txt")){
  newsample <- processFile("final/en_US/en_US.news.txt", 0.2)
  write.table(newsample, "sample/news_sample.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)
}

Here we divide the sample into three subsets (60% / 20% / 20%) and use them as the training, testing and validation sets respectively.

if(!file.exists("data/training.rds")){
  tw <- readLines("sample/twitter_sample.txt")
  bl <- readLines("sample/blogs_sample.txt")
  nw <- readLines("sample/news_sample.txt")
  text = c(tw, bl, nw)
  Encoding(text) <- "UTF-8"
  # docs <- text %>% strsplit(split = "(?!')[[:punct:]]", perl = TRUE) %>% unlist() %>%data_frame(text = .) %>% filter(!text == "")
  docs <- data_frame(text)
  set.seed(4869)
  intrain <- sample(nrow(docs), 0.6 * nrow(docs))
  training <- docs[intrain,]
  dir.create(file.path(".", "data"), showWarnings = FALSE)
  saveRDS(training, "data/training.rds")
  testing <- docs[-intrain, ]
  invalid <- sample(nrow(testing), 0.5 * nrow(testing))
  validating <- testing[invalid,]
  testing <- testing[-invalid,]
  saveRDS(validating, "data/validating.rds")
  saveRDS(testing, "data/testing.rds")
} else{
  training <- readRDS("data/training.rds")
}

Bad Words

We load a list of profane words; these are removed from the unigram table and excluded as prediction targets in the n-gram tables.

bad.words <- read.csv("bad-words.txt", col.names = c("word"), header = FALSE)

Unigram Exploration

We tokenize the text samples into words, remove tokens containing numbers as well as bad words, and do some exploratory analysis on the most frequent terms after also removing stop words.

unigram <- training %>%
  unnest_tokens(word, text) %>%
  filter(!grepl("[+-]?([0-9]*[.])?[0-9]+", word)) %>%
  count(word) %>%
  anti_join(bad.words) %>%
  ungroup() %>%
  arrange(desc(n))
## Joining, by = "word"
data(stop_words)
tidystop <- unigram %>% anti_join(stop_words)
## Joining, by = "word"
tidystop %>%
   .[1:30,] %>%
   mutate(word = reorder(word, n)) %>%
   ggplot(aes(word, n)) +
   geom_col() +
   xlab(NULL) +
   coord_flip()

library(wordcloud)
tidystop %>%
  with(wordcloud(word, n, max.words = 120))

library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
tidystop %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, wt = n, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("#F8766D", "#00BFC4"),
                   max.words = 120, title.size = 1)
## Joining, by = "word"

Generating n-grams

We tokenize the text into bigrams, trigrams and quadgrams, remove tokens that contain numbers, and drop any n-gram whose last word is a bad word, since that word would be offered as a prediction. We do not remove stop words, since they are valid predictions.

# Tokenize into n-grams, drop tokens that contain numbers, count them,
# split each n-gram into its leading words ("first") and final word ("last"),
# and remove n-grams whose final word is a bad word.
ngram <- function(text, n){
  text %>%
    unnest_tokens(ngram, text, token = "ngrams", n = n) %>%
    filter(!grepl("[+-]?([0-9]*[.])?[0-9]+", ngram)) %>%
    count(ngram) %>%
    extract(ngram, into = c("first", "last"), '(.*)\\s+([^ ]+)$') %>%
    anti_join(bad.words, by = c("last" = "word")) %>%
    ungroup() %>%
    arrange(desc(n))
}

bigram <- ngram(training, 2)
trigram <- ngram(training, 3)
quadgram <- ngram(training, 4)

Good-Turing Discount
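
For an n-gram observed r times, the raw count is discounted with the Good-Turing estimate d_r = (r + 1) / r * N_{r+1} / N_r, where N_r is the number of distinct n-grams seen exactly r times. The discount is applied for r = 1..5; larger counts are treated as reliable and keep a discount of 1. The mass removed here is what the Katz backoff below redistributes to lower-order models. A minimal illustration with made-up frequency-of-frequency counts (the real values come from calcDiscount below):

# Toy Good-Turing discounts; these N_r values are hypothetical.
Nr <- c(4500, 1200, 520, 290, 185, 130)   # N_1 .. N_6 (made up)
r  <- 1:5
data.frame(r = r, discount = (r + 1) / r * Nr[r + 1] / Nr[r])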

calcDiscount <- function(ngramTable){

  freqTable <- ngramTable %>% group_by(n) %>% summarise(count = n()) %>% ungroup() %>% arrange(n)
  freqTable$discount <- 1
  for(i in 1:5){
    currN = freqTable$count[i]
    nextN = freqTable$count[i + 1]
    freqTable$discount[i] <- (i + 1)/i * (nextN / currN)
  }
  freqTable <- freqTable %>% select(n, discount)
  ngramTable %>% select(first, last, n) %>% left_join(freqTable) %>% ungroup()
}

bigramTable <- calcDiscount(bigram)
## Joining, by = "n"
trigramTable <- calcDiscount(trigram)
## Joining, by = "n"
quadgramTable <- calcDiscount(quadgram)
## Joining, by = "n"
unigram$p = unigram$n / sum(unigram$n)
dir.create(file.path(".", "ngrams"), showWarnings = FALSE)
saveRDS(unigram, "ngrams/unigrams.rds")
saveRDS(bigramTable, "ngrams/bigrams.rds")
saveRDS(trigramTable, "ngrams/trigrams.rds")
saveRDS(quadgramTable, "ngrams/quadgrams.rds")

Calculate Conditional Probability with Katz Backoff
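
calcProb estimates the conditional probability of a word l following a phrase f, using at most the last three words of f. In Katz backoff, an observed n-gram contributes its discounted relative frequency, and the probability mass removed by discounting is passed down to the next lower-order model:

P(w | h) = d * c(h, w) / c(h)      if c(h, w) > 0
         = alpha(h) * P(w | h')    otherwise

where c() are counts, d is the Good-Turing discount for the observed count, h' is the history h with its first word dropped, and alpha(h) = 1 - sum over observed w of d * c(h, w) / c(h) is the leftover probability mass (tracked as leftover in the code). The implementation approximates this scheme: continuations already seen at a higher order are removed from the lower-order tables, and their discounted relative frequencies are scaled by the remaining mass, backing off from quadgrams to trigrams, bigrams and finally the unigram probabilities.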

calcProb <- function(f, l){
  three = tail(unlist(strsplit(f," ")),3)
  leftover <- 1
  quadtab <- quadgramTable %>% filter(first == paste(three, collapse = " "))
  r <-  quadtab %>% filter(last == l)
  if (nrow(r) > 0) {
    # print("quadgram hit!")
    d = r$discount[1]
    n = r$n[1]
    return(d * n / sum(quadtab$n))
  }

  if (nrow(quadtab) > 1)
    leftover <-  1 - sum(quadtab$discount * quadtab$n) / sum(quadtab$n)
  
  tritab <- trigramTable %>% filter(first == paste(tail(three,2), collapse = " "))
  tritab <- tritab %>% anti_join(quadtab, by = "last")
  r <-  tritab %>% filter(last == l)
  if (nrow(r) > 0) {
    # print("trigram hit!")
    d = r$discount[1]
    n = r$n[1]
    return(leftover * d * n / sum(tritab$n))
  }
  
  if (nrow(tritab) > 1)
    leftover <-  leftover * ( 1 - sum(tritab$discount * tritab$n) / sum(tritab$n))
  
  bitab <- bigramTable %>% filter(first == tail(three, 1))
  bitab <- bitab %>% anti_join(quadtab, by = "last")
  bitab <- bitab %>% anti_join(tritab, by = "last")
  r <-  bitab %>% filter(last == l)
  if (nrow(r) > 0) {
    print("bigram hit!")
    d = r$discount[1]
    n = r$n[1]
    return(leftover * d * n / sum(bitab$n))
  }

  leftover <- leftover * ( 1 - sum(bitab$discount * bitab$n) / sum(bitab$n))

  return(leftover * filter(unigram, word == l)$p)
}

calcProb("i am", "a")
## [1] 0.05531674

Make prediction

backoffPred returns the ten most probable next words for a phrase, collecting candidates from the quadgram, trigram and bigram tables in turn and stopping early once the leftover probability mass drops below 0.1.

backoffPred <- function(f){
  three = tail(unlist(strsplit(f," ")),3)
  leftover <- 1.0
  
  pred = data_frame()
  quadtab <- quadgramTable %>% filter(first == paste(three, collapse = " "))
  
  if (nrow(quadtab) > 0) {
    # print("quadgram hit!")
    pred <- quadtab %>% mutate(prob = n * discount / sum(quadtab$n)) %>% select(last, prob)
    leftover <-  1 - sum(pred$prob)
    if (leftover < 0.1)
      return(pred %>% arrange(desc(prob)) %>% head(10))
  }
 
  
  tritab <- trigramTable %>% filter(first == paste(tail(three,2), collapse = " ")) %>% anti_join(quadtab, by = "last")
  
  if (nrow(tritab) > 0) {
    # print("trigram hit!")
    pred <- rbind(pred, tritab %>% mutate(prob = leftover * n * discount / sum(tritab$n)) %>% select(last, prob))
    leftover <-  1 - sum(pred$prob)
    if (leftover < 0.1)
      return(pred %>% arrange(desc(prob)) %>% head(10))
  }
  
  bitab <- bigramTable %>% filter(first == tail(three, 1)) %>% anti_join(quadtab, by = "last") %>% anti_join(tritab, by = "last")

  if (nrow(bitab) > 0) {
    # print("bigram hit!")
    pred <- rbind(pred, bitab %>% ungroup() %>% mutate(prob = leftover * n * discount / sum(bitab$n)) %>% select(last, prob))
  }
  
  return(pred %>% arrange(desc(prob)) %>% head(10))
}

backoffPred("this is a")
last        prob
great       0.0655926
good        0.0517837
very        0.0322209
free        0.0212891
lot         0.0212891
perfect     0.0212891
beer        0.0212891
dangerous   0.0212891
big         0.0126582
joke        0.0122660

Model evaluation

We use the test dataset to evaluate our n-gram model.

set.seed(3825)
testing <- readRDS("data/testing.rds")
quadgram <- ngram(testing, 4)

Perplexity measures the inverse probability of the test set, normalized by the number of words; a smaller perplexity corresponds to a higher probability and therefore a better model.
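
A rough sketch of how perplexity could be estimated with calcProb over a random sample of the test quadgrams follows; the calcPerplexity helper, the sample size and the 1e-6 floor for unseen words are illustrative assumptions rather than part of the original analysis.

# Sketch only: PP = exp(-(1/N) * sum(log P(w_i | history_i))), estimated on a
# random sample of test quadgrams. The helper and the probability floor are
# hypothetical choices.
calcPerplexity <- function(testNgrams, nSample = 1000){
  idx <- sample(nrow(testNgrams), min(nSample, nrow(testNgrams)))
  sampled <- testNgrams[idx, ]
  logp <- mapply(function(f, l){
    p <- calcProb(f, l)
    if (length(p) == 0 || is.na(p) || p <= 0) p <- 1e-6  # unseen word
    log(p)
  }, sampled$first, sampled$last)
  exp(-mean(logp))
}
# calcPerplexity(quadgram)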

Profiling

References

tidytext
ngram
Katz's back-off
bad words
smoothing and discount, CS498JH Introduction to NLP