> text <- c("Because I could not stop for Death -", + "He kindly stopped for me -", + "The Carriage held but just Ourselves -", + "and Immortality") > > text [1] "Because I could not stop for Death -" [2] "He kindly stopped for me -" [3] "The Carriage held but just Ourselves -" [4] "and Immortality"
> library(tidytext) > text_df %>% + unnest_tokens(word, text) # A tibble: 20 x 2 line word <int> <chr> 1 1 because 2 1 i 3 1 could 4 1 not 5 1 stop 6 1 for 7 1 death 8 2 he 9 2 kindly 10 2 stopped 11 2 for 12 2 me 13 3 the
> library(tidytext) > text_df %>% + unnest_tokens(word, text) # A tibble: 20 x 2 line word <int> <chr> 1 1 because 2 1 i 3 1 could 4 1 not 5 1 stop 6 1 for 7 1 death 8 2 he 9 2 kindly 10 2 stopped 11 2 for 12 2 me • Other columns have been retained • Punctuation has been stripped • Words have been converted to lowercase
> tidy_books <- original_books %>% + unnest_tokens(word, text) > > tidy_books # A tibble: 725,055 x 4 book linenumber chapter word <fct> <int> <int> <chr> 1 Sense & Sensibility 1 0 sense 2 Sense & Sensibility 1 0 and 3 Sense & Sensibility 1 0 sensibility 4 Sense & Sensibility 3 0 by 5 Sense & Sensibility 3 0 jane 6 Sense & Sensibility 3 0 austen 7 Sense & Sensibility 5 0 1811 8 Sense & Sensibility 10 1 chapter 9 Sense & Sensibility 10 1 1 10 Sense & Sensibility 13 1 the # ... with 725,045 more rows
> get_stopwords() # A tibble: 175 x 2 word lexicon <chr> <chr> 1 i snowball 2 me snowball 3 my snowball 4 myself snowball 5 we snowball 6 our snowball 7 ours snowball 8 ourselves snowball 9 you snowball 10 your snowball # ... with 165 more rows
> get_stopwords(language = "pt") # A tibble: 203 x 2 word lexicon <chr> <chr> 1 de snowball 2 a snowball 3 o snowball 4 que snowball 5 e snowball 6 do snowball 7 da snowball 8 em snowball 9 um snowball 10 para snowball # ... with 193 more rows
> get_stopwords(source = "smart") # A tibble: 571 x 2 word lexicon <chr> <chr> 1 a smart 2 a's smart 3 able smart 4 about smart 5 above smart 6 according smart 7 accordingly smart 8 across smart 9 actually smart 10 after smart # ... with 561 more rows
> bing_word_counts # A tibble: 2,585 x 3 word sentiment n <chr> <chr> <int> 1 miss negative 1855 2 well positive 1523 3 good positive 1380 4 great positive 981 5 like positive 725 6 better positive 639 7 enough positive 613 8 happy positive 534 9 love positive 495 10 pleasure positive 462 # ... with 2,575 more rows Which words contribute to each sentiment?
> bing_word_counts # A tibble: 2,585 x 3 word sentiment n <chr> <chr> <int> 1 miss negative 1855 2 well positive 1523 3 good positive 1380 4 great positive 981 5 like positive 725 6 better positive 639 7 enough positive 613 8 happy positive 534 9 love positive 495 10 pleasure positive 462 # ... with 2,575 more rows Which words contribute to each sentiment?
book_words # A tibble: 40,379 x 4 book word n total <fct> <chr> <int> <int> 1 Mansfield Park the 6206 160460 2 Mansfield Park to 5475 160460 3 Mansfield Park and 5438 160460 4 Emma to 5239 160996 5 Emma the 5201 160996 6 Emma and 4896 160996 7 Mansfield Park of 4778 160460 8 Pride & Prejudice the 4331 122204 9 Emma of 4291 160996 10 Pride & Prejudice to 4162 122204 # ... with 40,369 more rows
book_words <- book_words %>% + bind_tf_idf(word, book, n) > book_words # A tibble: 40,379 x 7 book word n total tf idf tf_idf <fct> <chr> <int> <int> <dbl> <dbl> <dbl> 1 Mansfield Park the 6206 160460 0.0387 0 0 2 Mansfield Park to 5475 160460 0.0341 0 0 3 Mansfield Park and 5438 160460 0.0339 0 0 4 Emma to 5239 160996 0.0325 0 0 5 Emma the 5201 160996 0.0323 0 0 6 Emma and 4896 160996 0.0304 0 0 7 Mansfield Park of 4778 160460 0.0298 0 0 8 Pride & Prejudice the 4331 122204 0.0354 0 0 9 Emma of 4291 160996 0.0267 0 0 10 Pride & Prejudice to 4162 122204 0.0341 0 0 # ... with 40,369 more rows