This is the capstone project for the Data Science Specialization. In this work we focus on Natural Language Processing and word prediction.
Here we load the data from the original dataset, draw a sample containing 20% of the records from each file, and save the samples.
set.seed(5832)
# Read a text file and return a random sample containing sampleProp of its lines
processFile <- function(filepath, sampleProp) {
    df <- data.frame(text = readLines(filepath), stringsAsFactors = FALSE)
    return(sample_frac(df, sampleProp))
}
dir.create(file.path(".", "sample"), showWarnings = FALSE)
if(!file.exists("sample/twitter_sample.txt")){
twsample <- processFile("final/en_US/en_US.twitter.txt", 0.2)
write.table(twsample, "sample/twitter_sample.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)
}
if(!file.exists("sample/blogs_sample.txt")){
blogsample <- processFile("final/en_US/en_US.blogs.txt", 0.2)
write.table(blogsample, "sample/blogs_sample.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)
}
if(!file.exists("sample/news_sample.txt")){
newsample <- processFile("final/en_US/en_US.news.txt", 0.2)
write.table(newsample, "sample/news_sample.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)
}
Here we divide the sample into three subsets (60% / 20% / 20%) and use them as the training, testing and validation sets respectively.
if(!file.exists("data/training.rds")){
tw <- readLines("sample/twitter_sample.txt")
bl <- readLines("sample/blogs_sample.txt")
nw <- readLines("sample/news_sample.txt")
text = c(tw, bl, nw)
Encoding(text) <- "UTF-8"
# docs <- text %>% strsplit(split = "(?!')[[:punct:]]", perl = TRUE) %>% unlist() %>%data_frame(text = .) %>% filter(!text == "")
docs <- data_frame(text)
set.seed(4869)
intrain <- sample(nrow(docs), 0.6 * nrow(docs))
training <- docs[intrain,]
dir.create(file.path(".", "data"), showWarnings = FALSE)
saveRDS(training, "data/training.rds")
testing <- docs[-intrain, ]
invalid <- sample(nrow(testing), 0.5 * nrow(testing))
validating <- testing[invalid,]
testing <- testing[-invalid,]
saveRDS(validating, "data/validating.rds")
saveRDS(testing, "data/testing.rds")
} else{
training <- readRDS("data/training.rds")
}
# profanity list (one word per line) used to filter offensive words out of the model
bad.words <- read.csv("bad-words.txt", col.names = c("word"), header = FALSE)
We tokenize the text sample into words, filter out tokens containing numbers and profanity, and do some exploratory analysis; stop words are removed only for the exploratory plots, since they remain useful for prediction.
unigram <- training %>% unnest_tokens(word, text) %>%
    filter(!grepl("[+-]?([0-9]*[.])?[0-9]+", word)) %>%
    count(word) %>%
    anti_join(bad.words) %>%
    ungroup() %>%
    arrange(desc(n))
## Joining, by = "word"
data(stop_words)
tidystop <- unigram %>% anti_join(stop_words)
## Joining, by = "word"
tidystop %>%
    .[1:30,] %>%
    mutate(word = reorder(word, n)) %>%
    ggplot(aes(word, n)) +
    geom_col() +
    xlab(NULL) +
    coord_flip()
library(wordcloud)
tidystop %>%
with(wordcloud(word, n, max.words = 120))
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
tidystop %>%
    inner_join(get_sentiments("bing")) %>%
    count(word, sentiment, sort = TRUE) %>%
    acast(word ~ sentiment, value.var = "n", fill = 0) %>%
    comparison.cloud(colors = c("#F8766D", "#00BFC4"),
                     max.words = 120, title.size = 1)
## Joining, by = "word"
We tokenize the text into bigrams, trigrams and quadgrams, and remove any token whose last word (the word that would be predicted) is a profanity. We do not remove stop words, since they are valid predictions.
# Tokenize text into n-grams, drop n-grams containing digits, split each n-gram into
# its leading words ("first") and final word ("last"), and drop those whose final word
# is a profanity.
ngram <- function(text, n){
    text %>% unnest_tokens(ngram, text, token = "ngrams", n = n) %>%
        filter(!grepl("[+-]?([0-9]*[.])?[0-9]+", ngram)) %>%
        count(ngram) %>%
        extract(ngram, into = c("first", "last"), '(.*)\\s+([^ ]+)$') %>%
        anti_join(bad.words, by = c("last" = "word")) %>%
        ungroup() %>%
        arrange(desc(n))
}
bigram <- ngram(training, 2)
trigram <- ngram(training, 3)
quadgram <- ngram(training, 4)
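To reserve probability mass for continuations we have not seen, the next step applies a Good-Turing style discount to each observed count r from 1 to 5: d_r = (r + 1) / r * N_{r+1} / N_r, where N_r is the number of n-grams that occur exactly r times in the table. Counts above 5 are left undiscounted (d = 1).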
# Attach a Good-Turing style discount to each n-gram based on its count.
# Note: indexing freqTable by row assumes counts 1 through 6 all occur in the table,
# which holds for samples of this size; counts above 5 keep a discount of 1.
calcDiscount <- function(ngramTable){
    # N_r: number of distinct n-grams observed exactly r times
    freqTable <- ngramTable %>% group_by(n) %>% summarise(count = n()) %>% ungroup() %>% arrange(n)
    freqTable$discount <- 1
    for(i in 1:5){
        currN = freqTable$count[i]
        nextN = freqTable$count[i + 1]
        freqTable$discount[i] <- (i + 1)/i * (nextN / currN)
    }
    freqTable <- freqTable %>% select(n, discount)
    # join the discount for each n-gram's count back onto the n-gram table
    ngramTable %>% select(first, last, n) %>% left_join(freqTable) %>% ungroup()
}
bigramTable <- calcDiscount(bigram)
## Joining, by = "n"
trigramTable <- calcDiscount(trigram)
## Joining, by = "n"
quadgramTable <- calcDiscount(quadgram)
## Joining, by = "n"
unigram$p = unigram$n / sum(unigram$n) # maximum-likelihood unigram probabilities
dir.create(file.path(".", "ngrams"), showWarnings = FALSE)
saveRDS(unigram, "ngrams/unigrams.rds")
saveRDS(bigramTable, "ngrams/bigrams.rds")
saveRDS(trigramTable, "ngrams/trigrams.rds")
saveRDS(quadgramTable, "ngrams/quadgrams.rds")
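The calcProb function below scores a candidate word l as a completion of the phrase f with a Katz-style backoff: it first looks up the last three words of f in the quadgram table and, on a hit, returns the discounted relative frequency of l; otherwise the probability mass freed by the discounts is passed down to the trigram, bigram and finally unigram estimates, each time excluding completions already covered by a higher-order table.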
# Probability of candidate word l following phrase f, using Katz-style backoff over
# the quadgram, trigram, bigram and unigram tables.
calcProb <- function(f, l){
    three = tail(unlist(strsplit(f," ")),3)
    leftover <- 1
    # try the quadgram table first
    quadtab <- quadgramTable %>% filter(first == paste(three, collapse = " "))
    r <- quadtab %>% filter(last == l)
    if (nrow(r) > 0) {
        # print("quadgram hit!")
        d = r$discount[1]
        n = r$n[1]
        return(d * n / sum(quadtab$n))
    }
    if (nrow(quadtab) > 1)
        leftover <- 1 - sum(quadtab$discount * quadtab$n) / sum(quadtab$n)
    # back off to trigrams, excluding completions already covered by the quadgrams
    tritab <- trigramTable %>% filter(first == paste(tail(three,2), collapse = " "))
    tritab <- tritab %>% anti_join(quadtab, by = "last")
    r <- tritab %>% filter(last == l)
    if (nrow(r) > 0) {
        # print("trigram hit!")
        d = r$discount[1]
        n = r$n[1]
        return(leftover * d * n / sum(tritab$n))
    }
    if (nrow(tritab) > 1)
        leftover <- leftover * ( 1 - sum(tritab$discount * tritab$n) / sum(tritab$n))
    # back off to bigrams
    bitab <- bigramTable %>% filter(first == tail(three, 1))
    bitab <- bitab %>% anti_join(quadtab, by = "last")
    bitab <- bitab %>% anti_join(tritab, by = "last")
    r <- bitab %>% filter(last == l)
    if (nrow(r) > 0) {
        # print("bigram hit!")
        d = r$discount[1]
        n = r$n[1]
        return(leftover * d * n / sum(bitab$n))
    }
    # fall back to the unigram probability
    leftover <- leftover * ( 1 - sum(bitab$discount * bitab$n) / sum(bitab$n))
    return(leftover * filter(unigram, word == l)$p)
}
calcProb("i am", "a")
## [1] 0.05531674
# Top-10 next-word predictions for phrase f, using the same backoff scheme: candidates
# from higher-order tables are kept, and lower-order tables are consulted only while a
# non-trivial amount of probability mass (>= 0.1) is left over.
backoffPred <- function(f){
    three = tail(unlist(strsplit(f," ")),3)
    leftover <- 1.0
    pred = data_frame()
    quadtab <- quadgramTable %>% filter(first == paste(three, collapse = " "))
    if (nrow(quadtab) > 0) {
        # print("quadgram hit!")
        pred <- quadtab %>% mutate(prob = n * discount / sum(quadtab$n)) %>% select(last, prob)
        leftover <- 1 - sum(pred$prob)
        if (leftover < 0.1)
            return(pred %>% arrange(desc(prob)) %>% head(10))
    }
    tritab <- trigramTable %>% filter(first == paste(tail(three,2), collapse = " ")) %>% anti_join(quadtab, by = "last")
    if (nrow(tritab) > 0) {
        # print("trigram hit!")
        pred <- rbind(pred, tritab %>% mutate(prob = leftover * n * discount / sum(tritab$n)) %>% select(last, prob))
        leftover <- 1 - sum(pred$prob)
        if (leftover < 0.1)
            return(pred %>% arrange(desc(prob)) %>% head(10))
    }
    bitab <- bigramTable %>% filter(first == tail(three, 1)) %>% anti_join(quadtab, by = "last") %>% anti_join(tritab, by = "last")
    if (nrow(bitab) > 0) {
        # print("bigram hit!")
        pred <- rbind(pred, bitab %>% ungroup() %>% mutate(prob = leftover * n * discount / sum(bitab$n)) %>% select(last, prob))
    }
    return(pred %>% arrange(desc(prob)) %>% head(10))
}
backoffPred("this is a")
last | prob |
---|---|
great | 0.0655926 |
good | 0.0517837 |
very | 0.0322209 |
free | 0.0212891 |
lot | 0.0212891 |
perfect | 0.0212891 |
beer | 0.0212891 |
dangerous | 0.0212891 |
big | 0.0126582 |
joke | 0.0122660 |
We use the test dataset to evaluate our n-gram model.
set.seed(3825)
testing <- readRDS("data/testing.rds")
quadgram <- ngram(testing, 4)
Perplexity measures the inverse probability of the test set, normalized by the number of words; a smaller perplexity corresponds to a higher probability and thus a better model.
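As a quick sanity check, here is a minimal sketch of how such an estimate could be computed with the model above: perplexity = P(w_1 ... w_N)^(-1/N), i.e. the exponential of the negative average per-word log probability. The sample of 1,000 test quadgrams, the count-weighted mean and the 1e-8 floor for words the model has never seen are illustrative choices, not part of the final evaluation.
# Rough perplexity estimate over a sample of test quadgrams (illustrative only):
# per-word log probabilities come from calcProb(), weighted by how often each
# quadgram occurs, with unseen words floored at 1e-8 to avoid log(0).
evalSample <- quadgram %>% sample_n(min(1000, nrow(quadgram)))
logProbs <- mapply(function(f, l) {
    p <- calcProb(f, l)
    if (length(p) == 0 || is.na(p)) p <- 1e-8
    log(max(p, 1e-8))
}, evalSample$first, evalSample$last)
perplexity <- exp(-weighted.mean(logProbs, evalSample$n))
perplexity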